165 files changed, 8329 insertions, 5413 deletions
diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt
index 9dce540771fb..3c18001dfd5d 100644
--- a/Documentation/devicetree/bindings/mmc/mmc.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc.txt
@@ -38,6 +38,8 @@ Optional properties:
 - mmc-highspeed-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
 - mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
 - mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
+- mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
+- mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported
 
 *NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line
 polarity properties, we have to fix the meaning of the "normal" and "inverted"
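A board that wants one of the new modes adds the flag to its controller node next to the existing speed-mode properties. A minimal sketch, assuming a hypothetical controller (compatible string, address and size are placeholders):

	emmc: mmc@fe330000 {
		compatible = "vendor,emmc-host";	/* hypothetical */
		reg = <0xfe330000 0x4000>;
		bus-width = <8>;
		non-removable;
		mmc-hs400-1_8v;
	};

HS400 is an 8-bit-only mode (see the mmc_select_hs400() hunk further down), so the flag is only meaningful together with bus-width = <8>.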
diff --git a/Documentation/devicetree/bindings/mmc/moxa,moxart-mmc.txt b/Documentation/devicetree/bindings/mmc/moxa,moxart-mmc.txt
new file mode 100644
index 000000000000..b63819149f22
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/moxa,moxart-mmc.txt
@@ -0,0 +1,30 @@
+MOXA ART MMC Host Controller Interface
+
+  Inherits from mmc binding[1].
+
+  [1] Documentation/devicetree/bindings/mmc/mmc.txt
+
+Required properties:
+
+- compatible :	Must be "moxa,moxart-mmc" or "faraday,ftsdc010"
+- reg :		Should contain registers location and length
+- interrupts :	Should contain the interrupt number
+- clocks :	Should contain phandle for the clock feeding the MMC controller
+
+Optional properties:
+
+- dmas :	Should contain two DMA channels, line request number must be 5 for
+		both channels
+- dma-names :	Must be "tx", "rx"
+
+Example:
+
+	mmc: mmc@98e00000 {
+		compatible = "moxa,moxart-mmc";
+		reg = <0x98e00000 0x5C>;
+		interrupts = <5 0>;
+		clocks = <&clk_apb>;
+		dmas = <&dma 5>,
+		       <&dma 5>;
+		dma-names = "tx", "rx";
+	};
diff --git a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
index 8f3f13315358..2d4a7258a10d 100644
--- a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
+++ b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
@@ -69,10 +69,6 @@ Optional properties:
 
 * supports-highspeed: Enables support for high speed cards (up to 50MHz)
 
-* caps2-mmc-hs200-1_8v: Supports mmc HS200 SDR 1.8V mode
-
-* caps2-mmc-hs200-1_2v: Supports mmc HS200 SDR 1.2V mode
-
 * broken-cd: as documented in mmc core bindings.
 
 * vmmc-supply: The phandle to the regulator to use for vmmc.  If this is
@@ -103,7 +99,6 @@ board specific portions as listed below.
 		clock-freq-min-max = <400000 200000000>;
 		num-slots = <1>;
 		supports-highspeed;
-		caps2-mmc-hs200-1_8v;
 		broken-cd;
 		fifo-depth = <0x80>;
 		card-detect-delay = <200>;
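The two caps2-* properties removed above were dw_mmc-private spellings of capabilities that the common mmc binding already covers. With the driver now calling mmc_of_parse() (see the dw_mmc.c hunks later in this diff), a board can express the same thing with the generic property from mmc.txt; a hedged one-line example for the node above:

	mmc-hs200-1_8v;		/* generic replacement for caps2-mmc-hs200-1_8v */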
"DDR " : "", uhs_bus_speed_mode, type, card->rca); } diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index acbc3f2aaaf9..7dc0c85fdb60 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -800,6 +800,10 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) data->timeout_ns = limit_us * 1000; data->timeout_clks = 0; } + + /* assign limit value if invalid */ + if (timeout_us == 0) + data->timeout_ns = limit_us * 1000; } /* @@ -1310,31 +1314,38 @@ int mmc_regulator_set_ocr(struct mmc_host *mmc, } EXPORT_SYMBOL_GPL(mmc_regulator_set_ocr); +#endif /* CONFIG_REGULATOR */ + int mmc_regulator_get_supply(struct mmc_host *mmc) { struct device *dev = mmc_dev(mmc); - struct regulator *supply; int ret; - supply = devm_regulator_get(dev, "vmmc"); - mmc->supply.vmmc = supply; + mmc->supply.vmmc = devm_regulator_get_optional(dev, "vmmc"); mmc->supply.vqmmc = devm_regulator_get_optional(dev, "vqmmc"); - if (IS_ERR(supply)) - return PTR_ERR(supply); + if (IS_ERR(mmc->supply.vmmc)) { + if (PTR_ERR(mmc->supply.vmmc) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_info(dev, "No vmmc regulator found\n"); + } else { + ret = mmc_regulator_get_ocrmask(mmc->supply.vmmc); + if (ret > 0) + mmc->ocr_avail = ret; + else + dev_warn(dev, "Failed getting OCR mask: %d\n", ret); + } - ret = mmc_regulator_get_ocrmask(supply); - if (ret > 0) - mmc->ocr_avail = ret; - else - dev_warn(mmc_dev(mmc), "Failed getting OCR mask: %d\n", ret); + if (IS_ERR(mmc->supply.vqmmc)) { + if (PTR_ERR(mmc->supply.vqmmc) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_info(dev, "No vqmmc regulator found\n"); + } return 0; } EXPORT_SYMBOL_GPL(mmc_regulator_get_supply); -#endif /* CONFIG_REGULATOR */ - /* * Mask off any voltages we don't support and select * the lowest voltage @@ -1533,8 +1544,13 @@ void mmc_power_up(struct mmc_host *host, u32 ocr) host->ios.timing = MMC_TIMING_LEGACY; mmc_set_ios(host); - /* Set signal voltage to 3.3V */ - __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_330); + /* Try to set signal voltage to 3.3V but fall back to 1.8v or 1.2v */ + if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_330) == 0) + dev_dbg(mmc_dev(host), "Initial signal voltage of 3.3v\n"); + else if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180) == 0) + dev_dbg(mmc_dev(host), "Initial signal voltage of 1.8v\n"); + else if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120) == 0) + dev_dbg(mmc_dev(host), "Initial signal voltage of 1.2v\n"); /* * This delay should be sufficient to allow the power supply @@ -2183,7 +2199,7 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen) { struct mmc_command cmd = {0}; - if (mmc_card_blockaddr(card) || mmc_card_ddr_mode(card)) + if (mmc_card_blockaddr(card) || mmc_card_ddr52(card)) return 0; cmd.opcode = MMC_SET_BLOCKLEN; @@ -2263,7 +2279,6 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) } } - host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_DDR); if (mmc_host_is_spi(host)) { host->ios.chip_select = MMC_CS_HIGH; host->ios.bus_mode = MMC_BUSMODE_PUSHPULL; @@ -2403,6 +2418,11 @@ void mmc_rescan(struct work_struct *work) container_of(work, struct mmc_host, detect.work); int i; + if (host->trigger_card_event && host->ops->card_event) { + host->ops->card_event(host); + host->trigger_card_event = false; + } + if (host->rescan_disable) return; diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 54829c0ed000..91eb16223246 100644 --- a/drivers/mmc/core/debugfs.c +++ 
@@ -1533,8 +1544,13 @@ void mmc_power_up(struct mmc_host *host, u32 ocr)
 	host->ios.timing = MMC_TIMING_LEGACY;
 	mmc_set_ios(host);
 
-	/* Set signal voltage to 3.3V */
-	__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_330);
+	/* Try to set signal voltage to 3.3V but fall back to 1.8v or 1.2v */
+	if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_330) == 0)
+		dev_dbg(mmc_dev(host), "Initial signal voltage of 3.3v\n");
+	else if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180) == 0)
+		dev_dbg(mmc_dev(host), "Initial signal voltage of 1.8v\n");
+	else if (__mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120) == 0)
+		dev_dbg(mmc_dev(host), "Initial signal voltage of 1.2v\n");
 
 	/*
 	 * This delay should be sufficient to allow the power supply
@@ -2183,7 +2199,7 @@ int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen)
 {
 	struct mmc_command cmd = {0};
 
-	if (mmc_card_blockaddr(card) || mmc_card_ddr_mode(card))
+	if (mmc_card_blockaddr(card) || mmc_card_ddr52(card))
 		return 0;
 
 	cmd.opcode = MMC_SET_BLOCKLEN;
@@ -2263,7 +2279,6 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check)
 		}
 	}
 
-	host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_DDR);
 	if (mmc_host_is_spi(host)) {
 		host->ios.chip_select = MMC_CS_HIGH;
 		host->ios.bus_mode = MMC_BUSMODE_PUSHPULL;
@@ -2403,6 +2418,11 @@ void mmc_rescan(struct work_struct *work)
 		container_of(work, struct mmc_host, detect.work);
 	int i;
 
+	if (host->trigger_card_event && host->ops->card_event) {
+		host->ops->card_event(host);
+		host->trigger_card_event = false;
+	}
+
 	if (host->rescan_disable)
 		return;
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index 54829c0ed000..91eb16223246 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -135,8 +135,14 @@ static int mmc_ios_show(struct seq_file *s, void *data)
 	case MMC_TIMING_UHS_DDR50:
 		str = "sd uhs DDR50";
 		break;
+	case MMC_TIMING_MMC_DDR52:
+		str = "mmc DDR52";
+		break;
 	case MMC_TIMING_MMC_HS200:
-		str = "mmc high-speed SDR200";
+		str = "mmc HS200";
+		break;
+	case MMC_TIMING_MMC_HS400:
+		str = "mmc HS400";
 		break;
 	default:
 		str = "invalid";
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index fdea825dbb24..95cceae96944 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -447,6 +447,10 @@ int mmc_of_parse(struct mmc_host *host)
 		host->caps2 |= MMC_CAP2_HS200_1_8V_SDR;
 	if (of_find_property(np, "mmc-hs200-1_2v", &len))
 		host->caps2 |= MMC_CAP2_HS200_1_2V_SDR;
+	if (of_find_property(np, "mmc-hs400-1_8v", &len))
+		host->caps2 |= MMC_CAP2_HS400_1_8V | MMC_CAP2_HS200_1_8V_SDR;
+	if (of_find_property(np, "mmc-hs400-1_2v", &len))
+		host->caps2 |= MMC_CAP2_HS400_1_2V | MMC_CAP2_HS200_1_2V_SDR;
 
 	return 0;
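Note that each mmc-hs400-* property also sets the matching HS200 capability: the HS400 bring-up in mmc.c below always passes through HS200 tuning first, so a host that claims HS400 must be HS200-capable at the same signal voltage. For a platform host driver the whole property table is reached through a single call; a minimal sketch, assuming a hypothetical driver context struct:

	mmc = mmc_alloc_host(sizeof(struct my_host), &pdev->dev);	/* my_host is hypothetical */
	if (!mmc)
		return -ENOMEM;

	ret = mmc_of_parse(mmc);	/* fills caps/caps2, bus width, CD/WP GPIOs from the DT node */
	if (ret)
		goto err_free_host;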
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 1ab5f3a0af5b..793c6f7ddb04 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -240,31 +240,62 @@ static int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd)
 static void mmc_select_card_type(struct mmc_card *card)
 {
 	struct mmc_host *host = card->host;
-	u8 card_type = card->ext_csd.raw_card_type & EXT_CSD_CARD_TYPE_MASK;
+	u8 card_type = card->ext_csd.raw_card_type;
 	u32 caps = host->caps, caps2 = host->caps2;
-	unsigned int hs_max_dtr = 0;
+	unsigned int hs_max_dtr = 0, hs200_max_dtr = 0;
+	unsigned int avail_type = 0;
 
-	if (card_type & EXT_CSD_CARD_TYPE_26)
+	if (caps & MMC_CAP_MMC_HIGHSPEED &&
+	    card_type & EXT_CSD_CARD_TYPE_HS_26) {
 		hs_max_dtr = MMC_HIGH_26_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS_26;
+	}
 
 	if (caps & MMC_CAP_MMC_HIGHSPEED &&
-	    card_type & EXT_CSD_CARD_TYPE_52)
+	    card_type & EXT_CSD_CARD_TYPE_HS_52) {
 		hs_max_dtr = MMC_HIGH_52_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS_52;
+	}
+
+	if (caps & MMC_CAP_1_8V_DDR &&
+	    card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) {
+		hs_max_dtr = MMC_HIGH_DDR_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_DDR_1_8V;
+	}
 
-	if ((caps & MMC_CAP_1_8V_DDR &&
-	     card_type & EXT_CSD_CARD_TYPE_DDR_1_8V) ||
-	    (caps & MMC_CAP_1_2V_DDR &&
-	     card_type & EXT_CSD_CARD_TYPE_DDR_1_2V))
+	if (caps & MMC_CAP_1_2V_DDR &&
+	    card_type & EXT_CSD_CARD_TYPE_DDR_1_2V) {
 		hs_max_dtr = MMC_HIGH_DDR_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_DDR_1_2V;
+	}
+
+	if (caps2 & MMC_CAP2_HS200_1_8V_SDR &&
+	    card_type & EXT_CSD_CARD_TYPE_HS200_1_8V) {
+		hs200_max_dtr = MMC_HS200_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS200_1_8V;
+	}
+
+	if (caps2 & MMC_CAP2_HS200_1_2V_SDR &&
+	    card_type & EXT_CSD_CARD_TYPE_HS200_1_2V) {
+		hs200_max_dtr = MMC_HS200_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS200_1_2V;
+	}
+
+	if (caps2 & MMC_CAP2_HS400_1_8V &&
+	    card_type & EXT_CSD_CARD_TYPE_HS400_1_8V) {
+		hs200_max_dtr = MMC_HS200_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS400_1_8V;
+	}
 
-	if ((caps2 & MMC_CAP2_HS200_1_8V_SDR &&
-	     card_type & EXT_CSD_CARD_TYPE_SDR_1_8V) ||
-	    (caps2 & MMC_CAP2_HS200_1_2V_SDR &&
-	     card_type & EXT_CSD_CARD_TYPE_SDR_1_2V))
-		hs_max_dtr = MMC_HS200_MAX_DTR;
+	if (caps2 & MMC_CAP2_HS400_1_2V &&
+	    card_type & EXT_CSD_CARD_TYPE_HS400_1_2V) {
+		hs200_max_dtr = MMC_HS200_MAX_DTR;
+		avail_type |= EXT_CSD_CARD_TYPE_HS400_1_2V;
+	}
 
 	card->ext_csd.hs_max_dtr = hs_max_dtr;
-	card->ext_csd.card_type = card_type;
+	card->ext_csd.hs200_max_dtr = hs200_max_dtr;
+	card->mmc_avail_type = avail_type;
 }
 
 /*
@@ -480,6 +511,8 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
 			ext_csd[EXT_CSD_PWR_CL_DDR_52_195];
 		card->ext_csd.raw_pwr_cl_ddr_52_360 =
 			ext_csd[EXT_CSD_PWR_CL_DDR_52_360];
+		card->ext_csd.raw_pwr_cl_ddr_200_360 =
+			ext_csd[EXT_CSD_PWR_CL_DDR_200_360];
 	}
 
 	if (card->ext_csd.rev >= 5) {
@@ -646,7 +679,10 @@ static int mmc_compare_ext_csds(struct mmc_card *card, unsigned bus_width)
 		(card->ext_csd.raw_pwr_cl_ddr_52_195 ==
 			bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_195]) &&
 		(card->ext_csd.raw_pwr_cl_ddr_52_360 ==
-			bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_360]));
+			bw_ext_csd[EXT_CSD_PWR_CL_DDR_52_360]) &&
+		(card->ext_csd.raw_pwr_cl_ddr_200_360 ==
+			bw_ext_csd[EXT_CSD_PWR_CL_DDR_200_360]));
+
 	if (err)
 		err = -EINVAL;
 
@@ -694,18 +730,10 @@ static struct attribute *mmc_std_attrs[] = {
 	&dev_attr_rel_sectors.attr,
 	NULL,
 };
-
-static struct attribute_group mmc_std_attr_group = {
-	.attrs = mmc_std_attrs,
-};
-
-static const struct attribute_group *mmc_attr_groups[] = {
-	&mmc_std_attr_group,
-	NULL,
-};
+ATTRIBUTE_GROUPS(mmc_std);
 
 static struct device_type mmc_type = {
-	.groups = mmc_attr_groups,
+	.groups = mmc_std_groups,
 };
 
 /*
@@ -714,17 +742,13 @@ static struct device_type mmc_type = {
  * extended CSD register, select it by executing the
  * mmc_switch command.
 */
-static int mmc_select_powerclass(struct mmc_card *card,
-		unsigned int bus_width)
+static int __mmc_select_powerclass(struct mmc_card *card,
+				   unsigned int bus_width)
 {
-	int err = 0;
+	struct mmc_host *host = card->host;
+	struct mmc_ext_csd *ext_csd = &card->ext_csd;
 	unsigned int pwrclass_val = 0;
-	struct mmc_host *host;
-
-	BUG_ON(!card);
-
-	host = card->host;
-	BUG_ON(!host);
+	int err = 0;
 
 	/* Power class selection is supported for versions >= 4.0 */
 	if (card->csd.mmca_vsn < CSD_SPEC_VER_4)
@@ -736,14 +760,14 @@ static int mmc_select_powerclass(struct mmc_card *card,
 
 	switch (1 << host->ios.vdd) {
 	case MMC_VDD_165_195:
-		if (host->ios.clock <= 26000000)
-			pwrclass_val = card->ext_csd.raw_pwr_cl_26_195;
-		else if (host->ios.clock <= 52000000)
+		if (host->ios.clock <= MMC_HIGH_26_MAX_DTR)
+			pwrclass_val = ext_csd->raw_pwr_cl_26_195;
+		else if (host->ios.clock <= MMC_HIGH_52_MAX_DTR)
 			pwrclass_val = (bus_width <= EXT_CSD_BUS_WIDTH_8) ?
-				card->ext_csd.raw_pwr_cl_52_195 :
-				card->ext_csd.raw_pwr_cl_ddr_52_195;
-		else if (host->ios.clock <= 200000000)
-			pwrclass_val = card->ext_csd.raw_pwr_cl_200_195;
+				ext_csd->raw_pwr_cl_52_195 :
+				ext_csd->raw_pwr_cl_ddr_52_195;
+		else if (host->ios.clock <= MMC_HS200_MAX_DTR)
+			pwrclass_val = ext_csd->raw_pwr_cl_200_195;
 		break;
 	case MMC_VDD_27_28:
 	case MMC_VDD_28_29:
@@ -754,14 +778,16 @@ static int mmc_select_powerclass(struct mmc_card *card,
 	case MMC_VDD_33_34:
 	case MMC_VDD_34_35:
 	case MMC_VDD_35_36:
-		if (host->ios.clock <= 26000000)
-			pwrclass_val = card->ext_csd.raw_pwr_cl_26_360;
-		else if (host->ios.clock <= 52000000)
+		if (host->ios.clock <= MMC_HIGH_26_MAX_DTR)
+			pwrclass_val = ext_csd->raw_pwr_cl_26_360;
+		else if (host->ios.clock <= MMC_HIGH_52_MAX_DTR)
 			pwrclass_val = (bus_width <= EXT_CSD_BUS_WIDTH_8) ?
-				card->ext_csd.raw_pwr_cl_52_360 :
-				card->ext_csd.raw_pwr_cl_ddr_52_360;
-		else if (host->ios.clock <= 200000000)
-			pwrclass_val = card->ext_csd.raw_pwr_cl_200_360;
+			ext_csd->raw_pwr_cl_52_360 :
+			ext_csd->raw_pwr_cl_ddr_52_360;
+		else if (host->ios.clock <= MMC_HS200_MAX_DTR)
+			pwrclass_val = (bus_width == EXT_CSD_DDR_BUS_WIDTH_8) ?
+				ext_csd->raw_pwr_cl_ddr_200_360 :
+				ext_csd->raw_pwr_cl_200_360;
 		break;
 	default:
 		pr_warning("%s: Voltage range not supported "
@@ -787,40 +813,79 @@ static int mmc_select_powerclass(struct mmc_card *card,
 	return err;
 }
 
+static int mmc_select_powerclass(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	u32 bus_width, ext_csd_bits;
+	int err, ddr;
+
+	/* Power class selection is supported for versions >= 4.0 */
+	if (card->csd.mmca_vsn < CSD_SPEC_VER_4)
+		return 0;
+
+	bus_width = host->ios.bus_width;
+	/* Power class values are defined only for 4/8 bit bus */
+	if (bus_width == MMC_BUS_WIDTH_1)
+		return 0;
+
+	ddr = card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_52;
+	if (ddr)
+		ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ?
+			EXT_CSD_DDR_BUS_WIDTH_8 : EXT_CSD_DDR_BUS_WIDTH_4;
+	else
+		ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ?
+			EXT_CSD_BUS_WIDTH_8 : EXT_CSD_BUS_WIDTH_4;
+
+	err = __mmc_select_powerclass(card, ext_csd_bits);
+	if (err)
+		pr_warn("%s: power class selection to bus width %d ddr %d failed\n",
+			mmc_hostname(host), 1 << bus_width, ddr);
+
+	return err;
+}
+
 /*
- * Selects the desired buswidth and switch to the HS200 mode
- * if bus width set without error
+ * Set the bus speed for the selected speed mode.
+ */
+static void mmc_set_bus_speed(struct mmc_card *card)
+{
+	unsigned int max_dtr = (unsigned int)-1;
+
+	if ((mmc_card_hs200(card) || mmc_card_hs400(card)) &&
+	     max_dtr > card->ext_csd.hs200_max_dtr)
+		max_dtr = card->ext_csd.hs200_max_dtr;
+	else if (mmc_card_hs(card) && max_dtr > card->ext_csd.hs_max_dtr)
+		max_dtr = card->ext_csd.hs_max_dtr;
+	else if (max_dtr > card->csd.max_dtr)
+		max_dtr = card->csd.max_dtr;
+
+	mmc_set_clock(card->host, max_dtr);
+}
+
+/*
+ * Select the bus width among 4-bit and 8-bit (SDR).
+ * If the bus width is changed successfully, return the selected width value.
+ * Zero is returned instead of error value if the wide width is not supported.
 */
-static int mmc_select_hs200(struct mmc_card *card)
+static int mmc_select_bus_width(struct mmc_card *card)
 {
-	int idx, err = -EINVAL;
-	struct mmc_host *host;
 	static unsigned ext_csd_bits[] = {
-		EXT_CSD_BUS_WIDTH_4,
 		EXT_CSD_BUS_WIDTH_8,
+		EXT_CSD_BUS_WIDTH_4,
 	};
 	static unsigned bus_widths[] = {
-		MMC_BUS_WIDTH_4,
 		MMC_BUS_WIDTH_8,
+		MMC_BUS_WIDTH_4,
 	};
+	struct mmc_host *host = card->host;
+	unsigned idx, bus_width = 0;
+	int err = 0;
 
-	BUG_ON(!card);
-
-	host = card->host;
-
-	if (card->ext_csd.card_type & EXT_CSD_CARD_TYPE_SDR_1_2V &&
-			host->caps2 & MMC_CAP2_HS200_1_2V_SDR)
-		err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120);
-
-	if (err && card->ext_csd.card_type & EXT_CSD_CARD_TYPE_SDR_1_8V &&
-			host->caps2 & MMC_CAP2_HS200_1_8V_SDR)
-		err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180);
-
-	/* If fails try again during next card power cycle */
-	if (err)
-		goto err;
+	if ((card->csd.mmca_vsn < CSD_SPEC_VER_4) &&
+	    !(host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA)))
+		return 0;
 
-	idx = (host->caps & MMC_CAP_8_BIT_DATA) ? 1 : 0;
+	idx = (host->caps & MMC_CAP_8_BIT_DATA) ? 0 : 1;
 
 	/*
 	 * Unlike SD, MMC cards dont have a configuration register to notify
@@ -828,8 +893,7 @@ static int mmc_select_hs200(struct mmc_card *card)
 	 * the supported bus width or compare the ext csd values of current
 	 * bus width and ext csd values of 1 bit mode read earlier.
 	 */
-	for (; idx >= 0; idx--) {
-
+	for (; idx < ARRAY_SIZE(bus_widths); idx++) {
 		/*
 		 * Host is capable of 8bit transfer, then switch
 		 * the device to work in 8bit transfer mode. If the
@@ -844,27 +908,266 @@ static int mmc_select_hs200(struct mmc_card *card)
 		if (err)
 			continue;
 
-		mmc_set_bus_width(card->host, bus_widths[idx]);
+		bus_width = bus_widths[idx];
+		mmc_set_bus_width(host, bus_width);
 
+		/*
+		 * If controller can't handle bus width test,
+		 * compare ext_csd previously read in 1 bit mode
+		 * against ext_csd at new bus width
+		 */
 		if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
-			err = mmc_compare_ext_csds(card, bus_widths[idx]);
+			err = mmc_compare_ext_csds(card, bus_width);
 		else
-			err = mmc_bus_test(card, bus_widths[idx]);
-		if (!err)
+			err = mmc_bus_test(card, bus_width);
+
+		if (!err) {
+			err = bus_width;
 			break;
+		} else {
+			pr_warn("%s: switch to bus width %d failed\n",
+				mmc_hostname(host), ext_csd_bits[idx]);
+		}
 	}
 
-	/* switch to HS200 mode if bus width set successfully */
+	return err;
+}
+
+/*
+ * Switch to the high-speed mode
+ */
+static int mmc_select_hs(struct mmc_card *card)
+{
+	int err;
+
+	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+			   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS,
+			   card->ext_csd.generic_cmd6_time,
+			   true, true, true);
 	if (!err)
+		mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
+
+	return err;
+}
+
+/*
+ * Activate wide bus and DDR if supported.
+ */
+static int mmc_select_hs_ddr(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	u32 bus_width, ext_csd_bits;
+	int err = 0;
+
+	if (!(card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_52))
+		return 0;
+
+	bus_width = host->ios.bus_width;
+	if (bus_width == MMC_BUS_WIDTH_1)
+		return 0;
+
+	ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ?
+		EXT_CSD_DDR_BUS_WIDTH_8 : EXT_CSD_DDR_BUS_WIDTH_4;
+
+	err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+			EXT_CSD_BUS_WIDTH,
+			ext_csd_bits,
+			card->ext_csd.generic_cmd6_time);
+	if (err) {
+		pr_warn("%s: switch to bus width %d ddr failed\n",
+			mmc_hostname(host), 1 << bus_width);
+		return err;
+	}
+
+	/*
+	 * eMMC cards can support 3.3V to 1.2V i/o (vccq)
+	 * signaling.
+	 *
+	 * EXT_CSD_CARD_TYPE_DDR_1_8V means 3.3V or 1.8V vccq.
+	 *
+	 * 1.8V vccq at 3.3V core voltage (vcc) is not required
+	 * in the JEDEC spec for DDR.
+	 *
+	 * Do not force change in vccq since we are obviously
+	 * working and no change to vccq is needed.
+	 *
+	 * WARNING: eMMC rules are NOT the same as SD DDR
+	 */
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_DDR_1_2V) {
+		err = __mmc_set_signal_voltage(host,
+				MMC_SIGNAL_VOLTAGE_120);
+		if (err)
+			return err;
+	}
+
+	mmc_set_timing(host, MMC_TIMING_MMC_DDR52);
+
+	return err;
+}
+
+static int mmc_select_hs400(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	int err = 0;
+
+	/*
+	 * HS400 mode requires 8-bit bus width
+	 */
+	if (!(card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS400 &&
+	      host->ios.bus_width == MMC_BUS_WIDTH_8))
+		return 0;
+
+	/*
+	 * Before switching to dual data rate operation for HS400,
+	 * it is required to convert from HS200 mode to HS mode.
+	 */
+	mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
+	mmc_set_bus_speed(card);
+
+	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+			   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS,
+			   card->ext_csd.generic_cmd6_time,
+			   true, true, true);
+	if (err) {
+		pr_warn("%s: switch to high-speed from hs200 failed, err:%d\n",
+			mmc_hostname(host), err);
+		return err;
+	}
+
+	err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+			 EXT_CSD_BUS_WIDTH,
+			 EXT_CSD_DDR_BUS_WIDTH_8,
+			 card->ext_csd.generic_cmd6_time);
+	if (err) {
+		pr_warn("%s: switch to bus width for hs400 failed, err:%d\n",
+			mmc_hostname(host), err);
+		return err;
+	}
+
+	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
+			   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS400,
+			   card->ext_csd.generic_cmd6_time,
+			   true, true, true);
+	if (err) {
+		pr_warn("%s: switch to hs400 failed, err:%d\n",
+			 mmc_hostname(host), err);
+		return err;
+	}
+
+	mmc_set_timing(host, MMC_TIMING_MMC_HS400);
+	mmc_set_bus_speed(card);
+
+	return 0;
+}
+
+/*
+ * For device supporting HS200 mode, the following sequence
+ * should be done before executing the tuning process.
+ * 1. set the desired bus width(4-bit or 8-bit, 1-bit is not supported)
+ * 2. switch to HS200 mode
+ * 3. set the clock to > 52Mhz and <=200MHz
+ */
+static int mmc_select_hs200(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	int err = -EINVAL;
+
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_2V)
+		err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120);
+
+	if (err && card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_8V)
+		err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_180);
+
+	/* If fails try again during next card power cycle */
+	if (err)
+		goto err;
+
+	/*
+	 * Set the bus width(4 or 8) with host's support and
+	 * switch to HS200 mode if bus width is set successfully.
+	 */
+	err = mmc_select_bus_width(card);
+	if (!IS_ERR_VALUE(err)) {
 		err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-				EXT_CSD_HS_TIMING, 2,
-				card->ext_csd.generic_cmd6_time,
-				true, true, true);
+				   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS200,
+				   card->ext_csd.generic_cmd6_time,
+				   true, true, true);
+		if (!err)
+			mmc_set_timing(host, MMC_TIMING_MMC_HS200);
+	}
 err:
 	return err;
 }
 
 /*
+ * Activate High Speed or HS200 mode if supported.
+ */
+static int mmc_select_timing(struct mmc_card *card)
+{
+	int err = 0;
+
+	if ((card->csd.mmca_vsn < CSD_SPEC_VER_4 &&
+	     card->ext_csd.hs_max_dtr == 0))
+		goto bus_speed;
+
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200)
+		err = mmc_select_hs200(card);
+	else if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS)
+		err = mmc_select_hs(card);
+
+	if (err && err != -EBADMSG)
+		return err;
+
+	if (err) {
+		pr_warn("%s: switch to %s failed\n",
+			mmc_hostname(card->host),
+			mmc_card_hs(card) ? "high-speed" :
+			(mmc_card_hs200(card) ? "hs200" : ""));
+		err = 0;
+	}
+
+bus_speed:
+	/*
+	 * Set the bus speed to the selected bus timing.
+	 * If timing is not selected, backward compatible is the default.
+	 */
+	mmc_set_bus_speed(card);
+	return err;
+}
+
+/*
+ * Execute tuning sequence to seek the proper bus operating
+ * conditions for HS200 and HS400, which sends CMD21 to the device.
+ */
+static int mmc_hs200_tuning(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	int err = 0;
+
+	/*
+	 * Timing should be adjusted to the HS400 target
+	 * operation frequency for the tuning process
+	 */
+	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS400 &&
+	    host->ios.bus_width == MMC_BUS_WIDTH_8)
+		if (host->ops->prepare_hs400_tuning)
+			host->ops->prepare_hs400_tuning(host, &host->ios);
+
+	if (host->ops->execute_tuning) {
+		mmc_host_clk_hold(host);
+		err = host->ops->execute_tuning(host,
+				MMC_SEND_TUNING_BLOCK_HS200);
+		mmc_host_clk_release(host);
+
+		if (err)
+			pr_warn("%s: tuning execution failed\n",
+				mmc_hostname(host));
+	}
+
+	return err;
+}
+
+/*
  * Handle the detection and initialisation of a card.
 *
 * In the case of a resume, "oldcard" will contain the card
@@ -874,9 +1177,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 	struct mmc_card *oldcard)
 {
 	struct mmc_card *card;
-	int err, ddr = 0;
+	int err;
 	u32 cid[4];
-	unsigned int max_dtr;
 	u32 rocr;
 	u8 *ext_csd = NULL;
@@ -1068,206 +1370,34 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 	}
 
 	/*
-	 * Activate high speed (if supported)
-	 */
-	if (card->ext_csd.hs_max_dtr != 0) {
-		err = 0;
-		if (card->ext_csd.hs_max_dtr > 52000000 &&
-		    host->caps2 & MMC_CAP2_HS200)
-			err = mmc_select_hs200(card);
-		else if	(host->caps & MMC_CAP_MMC_HIGHSPEED)
-			err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-					EXT_CSD_HS_TIMING, 1,
-					card->ext_csd.generic_cmd6_time,
-					true, true, true);
-
-		if (err && err != -EBADMSG)
-			goto free_card;
-
-		if (err) {
-			pr_warning("%s: switch to highspeed failed\n",
-			       mmc_hostname(card->host));
-			err = 0;
-		} else {
-			if (card->ext_csd.hs_max_dtr > 52000000 &&
-			    host->caps2 & MMC_CAP2_HS200) {
-				mmc_card_set_hs200(card);
-				mmc_set_timing(card->host,
-					       MMC_TIMING_MMC_HS200);
-			} else {
-				mmc_card_set_highspeed(card);
-				mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
-			}
-		}
-	}
-
-	/*
-	 * Compute bus speed.
-	 */
-	max_dtr = (unsigned int)-1;
-
-	if (mmc_card_highspeed(card) || mmc_card_hs200(card)) {
-		if (max_dtr > card->ext_csd.hs_max_dtr)
-			max_dtr = card->ext_csd.hs_max_dtr;
-		if (mmc_card_highspeed(card) && (max_dtr > 52000000))
-			max_dtr = 52000000;
-	} else if (max_dtr > card->csd.max_dtr) {
-		max_dtr = card->csd.max_dtr;
-	}
-
-	mmc_set_clock(host, max_dtr);
-
-	/*
-	 * Indicate DDR mode (if supported).
+	 * Select timing interface
 	 */
-	if (mmc_card_highspeed(card)) {
-		if ((card->ext_csd.card_type & EXT_CSD_CARD_TYPE_DDR_1_8V)
-			&& (host->caps & MMC_CAP_1_8V_DDR))
-				ddr = MMC_1_8V_DDR_MODE;
-		else if ((card->ext_csd.card_type & EXT_CSD_CARD_TYPE_DDR_1_2V)
-			&& (host->caps & MMC_CAP_1_2V_DDR))
-				ddr = MMC_1_2V_DDR_MODE;
-	}
+	err = mmc_select_timing(card);
+	if (err)
+		goto free_card;
 
-	/*
-	 * Indicate HS200 SDR mode (if supported).
-	 */
 	if (mmc_card_hs200(card)) {
-		u32 ext_csd_bits;
-		u32 bus_width = card->host->ios.bus_width;
-
-		/*
-		 * For devices supporting HS200 mode, the bus width has
-		 * to be set before executing the tuning function. If
-		 * set before tuning, then device will respond with CRC
-		 * errors for responses on CMD line. So for HS200 the
-		 * sequence will be
-		 * 1. set bus width 4bit / 8 bit (1 bit not supported)
-		 * 2. switch to HS200 mode
-		 * 3. set the clock to > 52Mhz <=200MHz and
-		 * 4. execute tuning for HS200
-		 */
-		if ((host->caps2 & MMC_CAP2_HS200) &&
-		    card->host->ops->execute_tuning) {
-			mmc_host_clk_hold(card->host);
-			err = card->host->ops->execute_tuning(card->host,
-				MMC_SEND_TUNING_BLOCK_HS200);
-			mmc_host_clk_release(card->host);
-		}
-		if (err) {
-			pr_warning("%s: tuning execution failed\n",
-				   mmc_hostname(card->host));
+		err = mmc_hs200_tuning(card);
+		if (err)
 			goto err;
-		}
 
-		ext_csd_bits = (bus_width == MMC_BUS_WIDTH_8) ?
-				EXT_CSD_BUS_WIDTH_8 : EXT_CSD_BUS_WIDTH_4;
-		err = mmc_select_powerclass(card, ext_csd_bits);
+		err = mmc_select_hs400(card);
 		if (err)
-			pr_warning("%s: power class selection to bus width %d"
-				   " failed\n", mmc_hostname(card->host),
-				   1 << bus_width);
+			goto err;
+	} else if (mmc_card_hs(card)) {
+		/* Select the desired bus width optionally */
+		err = mmc_select_bus_width(card);
+		if (!IS_ERR_VALUE(err)) {
+			err = mmc_select_hs_ddr(card);
+			if (err)
+				goto err;
+		}
 	}
 
 	/*
-	 * Activate wide bus and DDR (if supported).
+	 * Choose the power class with selected bus interface
 	 */
-	if (!mmc_card_hs200(card) &&
-	    (card->csd.mmca_vsn >= CSD_SPEC_VER_4) &&
-	    (host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) {
-		static unsigned ext_csd_bits[][2] = {
-			{ EXT_CSD_BUS_WIDTH_8, EXT_CSD_DDR_BUS_WIDTH_8 },
-			{ EXT_CSD_BUS_WIDTH_4, EXT_CSD_DDR_BUS_WIDTH_4 },
-			{ EXT_CSD_BUS_WIDTH_1, EXT_CSD_BUS_WIDTH_1 },
-		};
-		static unsigned bus_widths[] = {
-			MMC_BUS_WIDTH_8,
-			MMC_BUS_WIDTH_4,
-			MMC_BUS_WIDTH_1
-		};
-		unsigned idx, bus_width = 0;
-
-		if (host->caps & MMC_CAP_8_BIT_DATA)
-			idx = 0;
-		else
-			idx = 1;
-		for (; idx < ARRAY_SIZE(bus_widths); idx++) {
-			bus_width = bus_widths[idx];
-			if (bus_width == MMC_BUS_WIDTH_1)
-				ddr = 0; /* no DDR for 1-bit width */
-			err = mmc_select_powerclass(card, ext_csd_bits[idx][0]);
-			if (err)
-				pr_warning("%s: power class selection to "
-					   "bus width %d failed\n",
-					   mmc_hostname(card->host),
-					   1 << bus_width);
-
-			err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-					 EXT_CSD_BUS_WIDTH,
-					 ext_csd_bits[idx][0],
-					 card->ext_csd.generic_cmd6_time);
-			if (!err) {
-				mmc_set_bus_width(card->host, bus_width);
-
-				/*
-				 * If controller can't handle bus width test,
-				 * compare ext_csd previously read in 1 bit mode
-				 * against ext_csd at new bus width
-				 */
-				if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
-					err = mmc_compare_ext_csds(card,
-						bus_width);
-				else
-					err = mmc_bus_test(card, bus_width);
-				if (!err)
-					break;
-			}
-		}
-
-		if (!err && ddr) {
-			err = mmc_select_powerclass(card, ext_csd_bits[idx][1]);
-			if (err)
-				pr_warning("%s: power class selection to "
-					   "bus width %d ddr %d failed\n",
-					   mmc_hostname(card->host),
-					   1 << bus_width, ddr);
-
-			err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-					 EXT_CSD_BUS_WIDTH,
-					 ext_csd_bits[idx][1],
-					 card->ext_csd.generic_cmd6_time);
-		}
-		if (err) {
-			pr_warning("%s: switch to bus width %d ddr %d "
-				"failed\n", mmc_hostname(card->host),
-				1 << bus_width, ddr);
-			goto free_card;
-		} else if (ddr) {
-			/*
-			 * eMMC cards can support 3.3V to 1.2V i/o (vccq)
-			 * signaling.
-			 *
-			 * EXT_CSD_CARD_TYPE_DDR_1_8V means 3.3V or 1.8V vccq.
-			 *
-			 * 1.8V vccq at 3.3V core voltage (vcc) is not required
-			 * in the JEDEC spec for DDR.
-			 *
-			 * Do not force change in vccq since we are obviously
-			 * working and no change to vccq is needed.
-			 *
-			 * WARNING: eMMC rules are NOT the same as SD DDR
-			 */
-			if (ddr == MMC_1_2V_DDR_MODE) {
-				err = __mmc_set_signal_voltage(host,
-					MMC_SIGNAL_VOLTAGE_120);
-				if (err)
-					goto err;
-			}
-			mmc_card_set_ddr_mode(card);
-			mmc_set_timing(card->host, MMC_TIMING_UHS_DDR50);
-			mmc_set_bus_width(card->host, bus_width);
-		}
-	}
+	mmc_select_powerclass(card);
 
 	/*
 	 * Enable HPI feature (if supported)
@@ -1507,7 +1637,6 @@ static int _mmc_suspend(struct mmc_host *host, bool is_suspend)
 		err = mmc_sleep(host);
 	else if (!mmc_host_is_spi(host))
 		err = mmc_deselect_cards(host);
-	host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_200);
 
 	if (!err) {
 		mmc_power_off(host);
@@ -1637,7 +1766,6 @@ static int mmc_power_restore(struct mmc_host *host)
 {
 	int ret;
 
-	host->card->state &= ~(MMC_STATE_HIGHSPEED | MMC_STATE_HIGHSPEED_200);
 	mmc_claim_host(host);
 	ret = mmc_init_card(host, host->card->ocr, host->card);
 	mmc_release_host(host);
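Taken together, the mmc.c rework above replaces the old inline bring-up logic in mmc_init_card() with small helpers. The resulting call sequence, condensed from the hunks above (call sites simplified and error handling elided, so this is an illustration rather than the literal code):

	mmc_select_card_type(card);		/* EXT_CSD + host caps -> card->mmc_avail_type */
	err = mmc_select_timing(card);		/* HS200 or HS, then mmc_set_bus_speed() */
	if (mmc_card_hs200(card)) {
		err = mmc_hs200_tuning(card);	/* CMD21 via host->ops->execute_tuning */
		err = mmc_select_hs400(card);	/* HS200 -> HS -> DDR 8-bit -> HS400 */
	} else if (mmc_card_hs(card)) {
		err = mmc_select_bus_width(card);	/* try 8-bit first, then 4-bit */
		err = mmc_select_hs_ddr(card);		/* DDR52 if card and host allow */
	}
	mmc_select_powerclass(card);		/* best effort; failure is not fatal */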
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 2dd359d2242f..0c44510bf717 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -707,18 +707,10 @@ static struct attribute *sd_std_attrs[] = {
 	&dev_attr_serial.attr,
 	NULL,
 };
-
-static struct attribute_group sd_std_attr_group = {
-	.attrs = sd_std_attrs,
-};
-
-static const struct attribute_group *sd_attr_groups[] = {
-	&sd_std_attr_group,
-	NULL,
-};
+ATTRIBUTE_GROUPS(sd_std);
 
 struct device_type sd_type = {
-	.groups = sd_attr_groups,
+	.groups = sd_std_groups,
 };
 
 /*
@@ -895,7 +887,7 @@ unsigned mmc_sd_get_max_clock(struct mmc_card *card)
 {
 	unsigned max_dtr = (unsigned int)-1;
 
-	if (mmc_card_highspeed(card)) {
+	if (mmc_card_hs(card)) {
 		if (max_dtr > card->sw_caps.hs_max_dtr)
 			max_dtr = card->sw_caps.hs_max_dtr;
 	} else if (max_dtr > card->csd.max_dtr) {
@@ -905,12 +897,6 @@ unsigned mmc_sd_get_max_clock(struct mmc_card *card)
 	return max_dtr;
 }
 
-void mmc_sd_go_highspeed(struct mmc_card *card)
-{
-	mmc_card_set_highspeed(card);
-	mmc_set_timing(card->host, MMC_TIMING_SD_HS);
-}
-
 /*
  * Handle the detection and initialisation of a card.
 *
@@ -985,16 +971,13 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr,
 		err = mmc_sd_init_uhs_card(card);
 		if (err)
 			goto free_card;
-
-		/* Card is an ultra-high-speed card */
-		mmc_card_set_uhs(card);
 	} else {
 		/*
 		 * Attempt to change to high-speed (if supported)
 		 */
 		err = mmc_sd_switch_hs(card);
 		if (err > 0)
-			mmc_sd_go_highspeed(card);
+			mmc_set_timing(card->host, MMC_TIMING_SD_HS);
 		else if (err)
 			goto free_card;
 
@@ -1089,7 +1072,7 @@ static int _mmc_sd_suspend(struct mmc_host *host)
 	if (!mmc_host_is_spi(host))
 		err = mmc_deselect_cards(host);
-	host->card->state &= ~MMC_STATE_HIGHSPEED;
+
 	if (!err) {
 		mmc_power_off(host);
 		mmc_card_set_suspended(host->card);
@@ -1198,7 +1181,6 @@ static int mmc_sd_power_restore(struct mmc_host *host)
 {
 	int ret;
 
-	host->card->state &= ~MMC_STATE_HIGHSPEED;
 	mmc_claim_host(host);
 	ret = mmc_sd_init_card(host, host->card->ocr, host->card);
 	mmc_release_host(host);
diff --git a/drivers/mmc/core/sd.h b/drivers/mmc/core/sd.h
index 4b34b24f3f76..aab824a9a7f3 100644
--- a/drivers/mmc/core/sd.h
+++ b/drivers/mmc/core/sd.h
@@ -12,6 +12,5 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 	bool reinit);
 unsigned mmc_sd_get_max_clock(struct mmc_card *card);
 int mmc_sd_switch_hs(struct mmc_card *card);
-void mmc_sd_go_highspeed(struct mmc_card *card);
 
 #endif
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 4d721c6e2af0..e636d9e99e4a 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -363,7 +363,7 @@ static unsigned mmc_sdio_get_max_clock(struct mmc_card *card)
 {
 	unsigned max_dtr;
 
-	if (mmc_card_highspeed(card)) {
+	if (mmc_card_hs(card)) {
 		/*
 		 * The SDIO specification doesn't mention how
 		 * the CIS transfer speed register relates to
@@ -733,7 +733,6 @@ try_again:
 	mmc_set_clock(host, card->cis.max_dtr);
 
 	if (card->cccr.high_speed) {
-		mmc_card_set_highspeed(card);
 		mmc_set_timing(card->host, MMC_TIMING_SD_HS);
 	}
 
@@ -792,16 +791,13 @@ try_again:
 		err = mmc_sdio_init_uhs_card(card);
 		if (err)
 			goto remove;
-
-		/* Card is an ultra-high-speed card */
-		mmc_card_set_uhs(card);
 	} else {
 		/*
 		 * Switch to high-speed (if supported).
 		 */
 		err = sdio_enable_hs(card);
 		if (err > 0)
-			mmc_sd_go_highspeed(card);
+			mmc_set_timing(card->host, MMC_TIMING_SD_HS);
 		else if (err)
 			goto remove;
 
@@ -943,40 +939,21 @@ static int mmc_sdio_pre_suspend(struct mmc_host *host)
 */
 static int mmc_sdio_suspend(struct mmc_host *host)
 {
-	int i, err = 0;
-
-	for (i = 0; i < host->card->sdio_funcs; i++) {
-		struct sdio_func *func = host->card->sdio_func[i];
-		if (func && sdio_func_present(func) && func->dev.driver) {
-			const struct dev_pm_ops *pmops = func->dev.driver->pm;
-			err = pmops->suspend(&func->dev);
-			if (err)
-				break;
-		}
-	}
-	while (err && --i >= 0) {
-		struct sdio_func *func = host->card->sdio_func[i];
-		if (func && sdio_func_present(func) && func->dev.driver) {
-			const struct dev_pm_ops *pmops = func->dev.driver->pm;
-			pmops->resume(&func->dev);
-		}
-	}
-
-	if (!err && mmc_card_keep_power(host) && mmc_card_wake_sdio_irq(host)) {
+	if (mmc_card_keep_power(host) && mmc_card_wake_sdio_irq(host)) {
 		mmc_claim_host(host);
 		sdio_disable_wide(host->card);
 		mmc_release_host(host);
 	}
 
-	if (!err && !mmc_card_keep_power(host))
+	if (!mmc_card_keep_power(host))
 		mmc_power_off(host);
 
-	return err;
+	return 0;
 }
 
 static int mmc_sdio_resume(struct mmc_host *host)
 {
-	int i, err = 0;
+	int err = 0;
 
 	BUG_ON(!host);
 	BUG_ON(!host->card);
@@ -1019,24 +996,6 @@ static int mmc_sdio_resume(struct mmc_host *host)
 		wake_up_process(host->sdio_irq_thread);
 	mmc_release_host(host);
 
-	/*
-	 * If the card looked to be the same as before suspending, then
-	 * we proceed to resume all card functions. If one of them returns
-	 * an error then we simply return that error to the core and the
-	 * card will be redetected as new. It is the responsibility of
-	 * the function driver to perform further tests with the extra
-	 * knowledge it has of the card to confirm the card is indeed the
-	 * same as before suspending (same MAC address for network cards,
-	 * etc.) and return an error otherwise.
-	 */
-	for (i = 0; !err && i < host->card->sdio_funcs; i++) {
-		struct sdio_func *func = host->card->sdio_func[i];
-		if (func && sdio_func_present(func) && func->dev.driver) {
-			const struct dev_pm_ops *pmops = func->dev.driver->pm;
-			err = pmops->resume(&func->dev);
-		}
-	}
-
 	host->pm_flags &= ~MMC_PM_KEEP_POWER;
 	return err;
 }
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 92d1ba8e8153..4fa8fef9147f 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -197,20 +197,8 @@ static int sdio_bus_remove(struct device *dev)
 
 #ifdef CONFIG_PM
 
-#ifdef CONFIG_PM_SLEEP
-static int pm_no_operation(struct device *dev)
-{
-	/*
-	 * Prevent the PM core from calling SDIO device drivers' suspend
-	 * callback routines, which it is not supposed to do, by using this
-	 * empty function as the bus type suspend callaback for SDIO.
-	 */
-	return 0;
-}
-#endif
-
 static const struct dev_pm_ops sdio_bus_pm_ops = {
-	SET_SYSTEM_SLEEP_PM_OPS(pm_no_operation, pm_no_operation)
+	SET_SYSTEM_SLEEP_PM_OPS(pm_generic_suspend, pm_generic_resume)
 	SET_RUNTIME_PM_OPS(
 		pm_generic_runtime_suspend,
 		pm_generic_runtime_resume,
diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c
index aaa90460ed23..5cc13c8d35bb 100644
--- a/drivers/mmc/core/sdio_irq.c
+++ b/drivers/mmc/core/sdio_irq.c
@@ -90,6 +90,15 @@ static int process_sdio_pending_irqs(struct mmc_host *host)
 	return ret;
 }
 
+void sdio_run_irqs(struct mmc_host *host)
+{
+	mmc_claim_host(host);
+	host->sdio_irq_pending = true;
+	process_sdio_pending_irqs(host);
+	mmc_release_host(host);
+}
+EXPORT_SYMBOL_GPL(sdio_run_irqs);
+
 static int sdio_irq_thread(void *_host)
 {
 	struct mmc_host *host = _host;
@@ -189,14 +198,20 @@ static int sdio_card_irq_get(struct mmc_card *card)
 	WARN_ON(!host->claimed);
 
 	if (!host->sdio_irqs++) {
-		atomic_set(&host->sdio_irq_thread_abort, 0);
-		host->sdio_irq_thread =
-			kthread_run(sdio_irq_thread, host, "ksdioirqd/%s",
-				mmc_hostname(host));
-		if (IS_ERR(host->sdio_irq_thread)) {
-			int err = PTR_ERR(host->sdio_irq_thread);
-			host->sdio_irqs--;
-			return err;
+		if (!(host->caps2 & MMC_CAP2_SDIO_IRQ_NOTHREAD)) {
+			atomic_set(&host->sdio_irq_thread_abort, 0);
+			host->sdio_irq_thread =
+				kthread_run(sdio_irq_thread, host,
+					    "ksdioirqd/%s", mmc_hostname(host));
+			if (IS_ERR(host->sdio_irq_thread)) {
+				int err = PTR_ERR(host->sdio_irq_thread);
+				host->sdio_irqs--;
+				return err;
+			}
+		} else {
+			mmc_host_clk_hold(host);
+			host->ops->enable_sdio_irq(host, 1);
+			mmc_host_clk_release(host);
 		}
 	}
 
@@ -211,8 +226,14 @@ static int sdio_card_irq_put(struct mmc_card *card)
 	BUG_ON(host->sdio_irqs < 1);
 
 	if (!--host->sdio_irqs) {
-		atomic_set(&host->sdio_irq_thread_abort, 1);
-		kthread_stop(host->sdio_irq_thread);
+		if (!(host->caps2 & MMC_CAP2_SDIO_IRQ_NOTHREAD)) {
+			atomic_set(&host->sdio_irq_thread_abort, 1);
+			kthread_stop(host->sdio_irq_thread);
+		} else {
+			mmc_host_clk_hold(host);
+			host->ops->enable_sdio_irq(host, 0);
+			mmc_host_clk_release(host);
+		}
 	}
 
 	return 0;
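sdio_run_irqs() is the counterpart to the new MMC_CAP2_SDIO_IRQ_NOTHREAD capability: a host that sets the cap no longer gets a ksdioirqd thread and must dispatch SDIO card interrupts itself. A hedged sketch of what that looks like in a host driver's threaded interrupt handler (the driver struct and interrupt flag are hypothetical):

	static irqreturn_t my_host_irq_thread(int irq, void *dev_id)
	{
		struct my_host *host = dev_id;		/* hypothetical driver data */

		if (host->pending & MY_INT_SDIO)	/* card interrupt flagged by the ISR */
			sdio_run_irqs(host->mmc);	/* claims the host, polls pending func IRQs */

		return IRQ_HANDLED;
	}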
+ config MMC_OMAP tristate "TI OMAP Multimedia Card Interface support" depends on ARCH_OMAP @@ -688,6 +697,12 @@ config MMC_WMT To compile this driver as a module, choose M here: the module will be called wmt-sdmmc. +config MMC_USDHI6ROL0 + tristate "Renesas USDHI6ROL0 SD/SDIO Host Controller support" + help + This selects support for the Renesas USDHI6ROL0 SD/SDIO + Host Controller + config MMC_REALTEK_PCI tristate "Realtek PCI-E SD/MMC Card Interface Driver" depends on MFD_RTSX_PCI diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 61cbc241935b..7f81ddf1dd2c 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -50,7 +50,9 @@ obj-$(CONFIG_MMC_JZ4740) += jz4740_mmc.o obj-$(CONFIG_MMC_VUB300) += vub300.o obj-$(CONFIG_MMC_USHC) += ushc.o obj-$(CONFIG_MMC_WMT) += wmt-sdmmc.o +obj-$(CONFIG_MMC_MOXART) += moxart-mmc.o obj-$(CONFIG_MMC_SUNXI) += sunxi-mmc.o +obj-$(CONFIG_MMC_USDHI6ROL0) += usdhi6rol0.o obj-$(CONFIG_MMC_REALTEK_PCI) += rtsx_pci_sdmmc.o obj-$(CONFIG_MMC_REALTEK_USB) += rtsx_usb_sdmmc.o diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 42706ea0ba85..aece7cafbb97 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -820,16 +820,9 @@ static void atmci_pdc_complete(struct atmel_mci *host) atmci_pdc_cleanup(host); - /* - * If the card was removed, data will be NULL. No point trying - * to send the stop command or waiting for NBUSY in this case. - */ - if (host->data) { - dev_dbg(&host->pdev->dev, - "(%s) set pending xfer complete\n", __func__); - atmci_set_pending(host, EVENT_XFER_COMPLETE); - tasklet_schedule(&host->tasklet); - } + dev_dbg(&host->pdev->dev, "(%s) set pending xfer complete\n", __func__); + atmci_set_pending(host, EVENT_XFER_COMPLETE); + tasklet_schedule(&host->tasklet); } static void atmci_dma_cleanup(struct atmel_mci *host) diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 3423c5ed50c7..0fbc53ac7eae 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -187,7 +187,7 @@ static void dw_mci_exynos_set_ios(struct dw_mci *host, struct mmc_ios *ios) unsigned long actual; u8 div = priv->ciu_div + 1; - if (ios->timing == MMC_TIMING_UHS_DDR50) { + if (ios->timing == MMC_TIMING_MMC_DDR52) { mci_writel(host, CLKSEL, priv->ddr_timing); /* Should be double rate for DDR mode */ if (ios->bus_width == MMC_BUS_WIDTH_8) @@ -386,8 +386,7 @@ static int dw_mci_exynos_execute_tuning(struct dw_mci_slot *slot, u32 opcode, /* Common capabilities of Exynos4/Exynos5 SoC */ static unsigned long exynos_dwmmc_caps[4] = { - MMC_CAP_UHS_DDR50 | MMC_CAP_1_8V_DDR | - MMC_CAP_8_BIT_DATA | MMC_CAP_CMD23, + MMC_CAP_1_8V_DDR | MMC_CAP_8_BIT_DATA | MMC_CAP_CMD23, MMC_CAP_CMD23, MMC_CAP_CMD23, MMC_CAP_CMD23, @@ -426,7 +425,7 @@ static int dw_mci_exynos_probe(struct platform_device *pdev) return dw_mci_pltfm_register(pdev, drv_data); } -const struct dev_pm_ops dw_mci_exynos_pmops = { +static const struct dev_pm_ops dw_mci_exynos_pmops = { SET_SYSTEM_SLEEP_PM_OPS(dw_mci_exynos_suspend, dw_mci_exynos_resume) .resume_noirq = dw_mci_exynos_resume_noirq, .thaw_noirq = dw_mci_exynos_resume_noirq, diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index cced599d5aeb..1ac227c603b7 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -235,12 +235,6 @@ err: } #endif /* defined(CONFIG_DEBUG_FS) */ -static void dw_mci_set_timeout(struct dw_mci *host) -{ - /* timeout (maximum) */ - mci_writel(host, TMOUT, 0xffffffff); 
-} - static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) { struct mmc_data *data; @@ -257,9 +251,8 @@ static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) (cmd->opcode == SD_IO_RW_DIRECT && ((cmd->arg >> 9) & 0x1FFFF) == SDIO_CCCR_ABORT)) cmdr |= SDMMC_CMD_STOP; - else - if (cmd->opcode != MMC_SEND_STATUS && cmd->data) - cmdr |= SDMMC_CMD_PRV_DAT_WAIT; + else if (cmd->opcode != MMC_SEND_STATUS && cmd->data) + cmdr |= SDMMC_CMD_PRV_DAT_WAIT; if (cmd->flags & MMC_RSP_PRESENT) { /* We expect a response, so set this bit */ @@ -850,8 +843,6 @@ static void __dw_mci_start_request(struct dw_mci *host, u32 cmdflags; mrq = slot->mrq; - if (host->pdata->select_slot) - host->pdata->select_slot(slot->id); host->cur_slot = slot; host->mrq = mrq; @@ -864,7 +855,7 @@ static void __dw_mci_start_request(struct dw_mci *host, data = cmd->data; if (data) { - dw_mci_set_timeout(host); + mci_writel(host, TMOUT, 0xFFFFFFFF); mci_writel(host, BYTCNT, data->blksz*data->blocks); mci_writel(host, BLKSIZ, data->blksz); } @@ -962,7 +953,7 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) regs = mci_readl(slot->host, UHS_REG); /* DDR mode set */ - if (ios->timing == MMC_TIMING_UHS_DDR50) + if (ios->timing == MMC_TIMING_MMC_DDR52) regs |= ((0x1 << slot->id) << 16); else regs &= ~((0x1 << slot->id) << 16); @@ -985,17 +976,11 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_UP: set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags); - /* Power up slot */ - if (slot->host->pdata->setpower) - slot->host->pdata->setpower(slot->id, mmc->ocr_avail); regs = mci_readl(slot->host, PWREN); regs |= (1 << slot->id); mci_writel(slot->host, PWREN, regs); break; case MMC_POWER_OFF: - /* Power down slot */ - if (slot->host->pdata->setpower) - slot->host->pdata->setpower(slot->id, 0); regs = mci_readl(slot->host, PWREN); regs &= ~(1 << slot->id); mci_writel(slot->host, PWREN, regs); @@ -1009,15 +994,13 @@ static int dw_mci_get_ro(struct mmc_host *mmc) { int read_only; struct dw_mci_slot *slot = mmc_priv(mmc); - struct dw_mci_board *brd = slot->host->pdata; + int gpio_ro = mmc_gpio_get_ro(mmc); /* Use platform get_ro function, else try on board write protect */ if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) read_only = 0; - else if (brd->get_ro) - read_only = brd->get_ro(slot->id); - else if (gpio_is_valid(slot->wp_gpio)) - read_only = gpio_get_value(slot->wp_gpio); + else if (!IS_ERR_VALUE(gpio_ro)) + read_only = gpio_ro; else read_only = mci_readl(slot->host, WRTPRT) & (1 << slot->id) ? 
1 : 0; @@ -1039,8 +1022,6 @@ static int dw_mci_get_cd(struct mmc_host *mmc) /* Use platform get_cd function, else try onboard card detect */ if (brd->quirks & DW_MCI_QUIRK_BROKEN_CARD_DETECTION) present = 1; - else if (brd->get_cd) - present = !brd->get_cd(slot->id); else if (!IS_ERR_VALUE(gpio_cd)) present = gpio_cd; else @@ -1248,7 +1229,7 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data) data->error = -EIO; } - dev_err(host->dev, "data error, status 0x%08x\n", status); + dev_dbg(host->dev, "data error, status 0x%08x\n", status); /* * After an error, there may be data lingering @@ -2045,86 +2026,15 @@ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot) return quirks; } - -/* find out bus-width for a given slot */ -static u32 dw_mci_of_get_bus_wd(struct device *dev, u8 slot) -{ - struct device_node *np = dw_mci_of_find_slot_node(dev, slot); - u32 bus_wd = 1; - - if (!np) - return 1; - - if (of_property_read_u32(np, "bus-width", &bus_wd)) - dev_err(dev, "bus-width property not found, assuming width" - " as 1\n"); - return bus_wd; -} - -/* find the write protect gpio for a given slot; or -1 if none specified */ -static int dw_mci_of_get_wp_gpio(struct device *dev, u8 slot) -{ - struct device_node *np = dw_mci_of_find_slot_node(dev, slot); - int gpio; - - if (!np) - return -EINVAL; - - gpio = of_get_named_gpio(np, "wp-gpios", 0); - - /* Having a missing entry is valid; return silently */ - if (!gpio_is_valid(gpio)) - return -EINVAL; - - if (devm_gpio_request(dev, gpio, "dw-mci-wp")) { - dev_warn(dev, "gpio [%d] request failed\n", gpio); - return -EINVAL; - } - - return gpio; -} - -/* find the cd gpio for a given slot */ -static void dw_mci_of_get_cd_gpio(struct device *dev, u8 slot, - struct mmc_host *mmc) -{ - struct device_node *np = dw_mci_of_find_slot_node(dev, slot); - int gpio; - - if (!np) - return; - - gpio = of_get_named_gpio(np, "cd-gpios", 0); - - /* Having a missing entry is valid; return silently */ - if (!gpio_is_valid(gpio)) - return; - - if (mmc_gpio_request_cd(mmc, gpio, 0)) - dev_warn(dev, "gpio [%d] request failed\n", gpio); -} #else /* CONFIG_OF */ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot) { return 0; } -static u32 dw_mci_of_get_bus_wd(struct device *dev, u8 slot) -{ - return 1; -} static struct device_node *dw_mci_of_find_slot_node(struct device *dev, u8 slot) { return NULL; } -static int dw_mci_of_get_wp_gpio(struct device *dev, u8 slot) -{ - return -EINVAL; -} -static void dw_mci_of_get_cd_gpio(struct device *dev, u8 slot, - struct mmc_host *mmc) -{ - return; -} #endif /* CONFIG_OF */ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) @@ -2134,7 +2044,6 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) const struct dw_mci_drv_data *drv_data = host->drv_data; int ctrl_id, ret; u32 freq[2]; - u8 bus_width; mmc = mmc_alloc_host(sizeof(struct dw_mci_slot), host->dev); if (!mmc) @@ -2158,17 +2067,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) mmc->f_max = freq[1]; } - if (host->pdata->get_ocr) - mmc->ocr_avail = host->pdata->get_ocr(id); - else - mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - - /* - * Start with slot power disabled, it will be enabled when a card - * is detected. 
- */ - if (host->pdata->setpower) - host->pdata->setpower(id, 0); + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; if (host->pdata->caps) mmc->caps = host->pdata->caps; @@ -2189,19 +2088,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) if (host->pdata->caps2) mmc->caps2 = host->pdata->caps2; - if (host->pdata->get_bus_wd) - bus_width = host->pdata->get_bus_wd(slot->id); - else if (host->dev->of_node) - bus_width = dw_mci_of_get_bus_wd(host->dev, slot->id); - else - bus_width = 1; - - switch (bus_width) { - case 8: - mmc->caps |= MMC_CAP_8_BIT_DATA; - case 4: - mmc->caps |= MMC_CAP_4_BIT_DATA; - } + mmc_of_parse(mmc); if (host->pdata->blk_settings) { mmc->max_segs = host->pdata->blk_settings->max_segs; @@ -2226,8 +2113,10 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) #endif /* CONFIG_MMC_DW_IDMAC */ } - slot->wp_gpio = dw_mci_of_get_wp_gpio(host->dev, slot->id); - dw_mci_of_get_cd_gpio(host->dev, slot->id, mmc); + if (dw_mci_get_cd(mmc)) + set_bit(DW_MMC_CARD_PRESENT, &slot->flags); + else + clear_bit(DW_MMC_CARD_PRESENT, &slot->flags); ret = mmc_add_host(mmc); if (ret) @@ -2249,10 +2138,6 @@ err_setup_bus: static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id) { - /* Shutdown detect IRQ */ - if (slot->host->pdata->exit) - slot->host->pdata->exit(id); - /* Debugfs stuff is cleaned up by mmc core */ mmc_remove_host(slot->mmc); slot->host->slot[id] = NULL; @@ -2399,24 +2284,9 @@ static struct dw_mci_board *dw_mci_parse_dt(struct dw_mci *host) return ERR_PTR(ret); } - if (of_find_property(np, "keep-power-in-suspend", NULL)) - pdata->pm_caps |= MMC_PM_KEEP_POWER; - - if (of_find_property(np, "enable-sdio-wakeup", NULL)) - pdata->pm_caps |= MMC_PM_WAKE_SDIO_IRQ; - if (of_find_property(np, "supports-highspeed", NULL)) pdata->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED; - if (of_find_property(np, "caps2-mmc-hs200-1_8v", NULL)) - pdata->caps2 |= MMC_CAP2_HS200_1_8V_SDR; - - if (of_find_property(np, "caps2-mmc-hs200-1_2v", NULL)) - pdata->caps2 |= MMC_CAP2_HS200_1_2V_SDR; - - if (of_get_property(np, "cd-inverted", NULL)) - pdata->caps2 |= MMC_CAP2_CD_ACTIVE_HIGH; - return pdata; } @@ -2442,9 +2312,9 @@ int dw_mci_probe(struct dw_mci *host) } } - if (!host->pdata->select_slot && host->pdata->num_slots > 1) { + if (host->pdata->num_slots > 1) { dev_err(host->dev, - "Platform data must supply select_slot function\n"); + "Platform data must supply num_slots.\n"); return -ENODEV; } @@ -2474,12 +2344,19 @@ int dw_mci_probe(struct dw_mci *host) ret = clk_set_rate(host->ciu_clk, host->pdata->bus_hz); if (ret) dev_warn(host->dev, - "Unable to set bus rate to %ul\n", + "Unable to set bus rate to %uHz\n", host->pdata->bus_hz); } host->bus_hz = clk_get_rate(host->ciu_clk); } + if (!host->bus_hz) { + dev_err(host->dev, + "Platform data must supply bus speed\n"); + ret = -ENODEV; + goto err_clk_ciu; + } + if (drv_data && drv_data->init) { ret = drv_data->init(host); if (ret) { @@ -2516,13 +2393,6 @@ int dw_mci_probe(struct dw_mci *host) } } - if (!host->bus_hz) { - dev_err(host->dev, - "Platform data must supply bus speed\n"); - ret = -ENODEV; - goto err_regulator; - } - host->quirks = host->pdata->quirks; spin_lock_init(&host->lock); @@ -2666,8 +2536,6 @@ err_workqueue: err_dmaunmap: if (host->use_dma && host->dma_ops->exit) host->dma_ops->exit(host); - -err_regulator: if (host->vmmc) regulator_disable(host->vmmc); diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 68349779c396..738fa241d058 100644 --- 
a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -195,7 +195,6 @@ extern int dw_mci_resume(struct dw_mci *host); * @mmc: The mmc_host representing this slot. * @host: The MMC controller this slot is using. * @quirks: Slot-level quirks (DW_MCI_SLOT_QUIRK_XXX) - * @wp_gpio: If gpio_is_valid() we'll use this to read write protect. * @ctype: Card type for this slot. * @mrq: mmc_request currently being processed or waiting to be * processed, or NULL when the slot is idle. @@ -214,7 +213,6 @@ struct dw_mci_slot { struct dw_mci *host; int quirks; - int wp_gpio; u32 ctype; diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index de2139cf3444..537d6c7a5ae4 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -515,10 +515,13 @@ static irqreturn_t jz_mmc_irq_worker(int irq, void *devid) jz4740_mmc_send_command(host, req->stop); - timeout = jz4740_mmc_poll_irq(host, JZ_MMC_IRQ_PRG_DONE); - if (timeout) { - host->state = JZ4740_MMC_STATE_DONE; - break; + if (mmc_resp_type(req->stop) & MMC_RSP_BUSY) { + timeout = jz4740_mmc_poll_irq(host, + JZ_MMC_IRQ_PRG_DONE); + if (timeout) { + host->state = JZ4740_MMC_STATE_DONE; + break; + } } case JZ4740_MMC_STATE_DONE: break; diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index a084edd37af5..7ad463e9741c 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -301,7 +301,8 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired) if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8) clk |= MCI_ST_8BIT_BUS; - if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50) + if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 || + host->mmc->ios.timing == MMC_TIMING_MMC_DDR52) clk |= MCI_ST_UX500_NEG_EDGE; mmci_write_clkreg(host, clk); @@ -764,7 +765,8 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data) mmci_write_clkreg(host, clk); } - if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50) + if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 || + host->mmc->ios.timing == MMC_TIMING_MMC_DDR52) datactrl |= MCI_ST_DPSM_DDRMODE; /* diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c new file mode 100644 index 000000000000..74924a04026e --- /dev/null +++ b/drivers/mmc/host/moxart-mmc.c @@ -0,0 +1,730 @@ +/* + * MOXA ART MMC host driver. + * + * Copyright (C) 2014 Jonas Jensen + * + * Jonas Jensen <jonas.jensen@gmail.com> + * + * Based on code from + * Moxa Technologies Co., Ltd. <www.moxa.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/blkdev.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> +#include <linux/mmc/host.h> +#include <linux/mmc/sd.h> +#include <linux/sched.h> +#include <linux/io.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/clk.h> +#include <linux/bitops.h> +#include <linux/of_dma.h> +#include <linux/spinlock.h> + +#define REG_COMMAND 0 +#define REG_ARGUMENT 4 +#define REG_RESPONSE0 8 +#define REG_RESPONSE1 12 +#define REG_RESPONSE2 16 +#define REG_RESPONSE3 20 +#define REG_RESPONSE_COMMAND 24 +#define REG_DATA_CONTROL 28 +#define REG_DATA_TIMER 32 +#define REG_DATA_LENGTH 36 +#define REG_STATUS 40 +#define REG_CLEAR 44 +#define REG_INTERRUPT_MASK 48 +#define REG_POWER_CONTROL 52 +#define REG_CLOCK_CONTROL 56 +#define REG_BUS_WIDTH 60 +#define REG_DATA_WINDOW 64 +#define REG_FEATURE 68 +#define REG_REVISION 72 + +/* REG_COMMAND */ +#define CMD_SDC_RESET BIT(10) +#define CMD_EN BIT(9) +#define CMD_APP_CMD BIT(8) +#define CMD_LONG_RSP BIT(7) +#define CMD_NEED_RSP BIT(6) +#define CMD_IDX_MASK 0x3f + +/* REG_RESPONSE_COMMAND */ +#define RSP_CMD_APP BIT(6) +#define RSP_CMD_IDX_MASK 0x3f + +/* REG_DATA_CONTROL */ +#define DCR_DATA_FIFO_RESET BIT(8) +#define DCR_DATA_THRES BIT(7) +#define DCR_DATA_EN BIT(6) +#define DCR_DMA_EN BIT(5) +#define DCR_DATA_WRITE BIT(4) +#define DCR_BLK_SIZE 0x0f + +/* REG_DATA_LENGTH */ +#define DATA_LEN_MASK 0xffffff + +/* REG_STATUS */ +#define WRITE_PROT BIT(12) +#define CARD_DETECT BIT(11) +/* 1-10 below can be sent to either registers, interrupt or clear. */ +#define CARD_CHANGE BIT(10) +#define FIFO_ORUN BIT(9) +#define FIFO_URUN BIT(8) +#define DATA_END BIT(7) +#define CMD_SENT BIT(6) +#define DATA_CRC_OK BIT(5) +#define RSP_CRC_OK BIT(4) +#define DATA_TIMEOUT BIT(3) +#define RSP_TIMEOUT BIT(2) +#define DATA_CRC_FAIL BIT(1) +#define RSP_CRC_FAIL BIT(0) + +#define MASK_RSP (RSP_TIMEOUT | RSP_CRC_FAIL | \ + RSP_CRC_OK | CARD_DETECT | CMD_SENT) + +#define MASK_DATA (DATA_CRC_OK | DATA_END | \ + DATA_CRC_FAIL | DATA_TIMEOUT) + +#define MASK_INTR_PIO (FIFO_URUN | FIFO_ORUN | CARD_CHANGE) + +/* REG_POWER_CONTROL */ +#define SD_POWER_ON BIT(4) +#define SD_POWER_MASK 0x0f + +/* REG_CLOCK_CONTROL */ +#define CLK_HISPD BIT(9) +#define CLK_OFF BIT(8) +#define CLK_SD BIT(7) +#define CLK_DIV_MASK 0x7f + +/* REG_BUS_WIDTH */ +#define BUS_WIDTH_8 BIT(2) +#define BUS_WIDTH_4 BIT(1) +#define BUS_WIDTH_1 BIT(0) + +#define MMC_VDD_360 23 +#define MIN_POWER (MMC_VDD_360 - SD_POWER_MASK) +#define MAX_RETRIES 500000 + +struct moxart_host { + spinlock_t lock; + + void __iomem *base; + + phys_addr_t reg_phys; + + struct dma_chan *dma_chan_tx; + struct dma_chan *dma_chan_rx; + struct dma_async_tx_descriptor *tx_desc; + struct mmc_host *mmc; + struct mmc_request *mrq; + struct scatterlist *cur_sg; + struct completion dma_complete; + struct completion pio_complete; + + u32 num_sg; + u32 data_remain; + u32 data_len; + u32 fifo_width; + u32 timeout; + u32 rate; + + long sysclk; + + bool have_dma; + bool is_removed; +}; + +static inline void moxart_init_sg(struct moxart_host *host, + struct mmc_data *data) +{ + host->cur_sg = data->sg; + host->num_sg = data->sg_len; + host->data_remain = host->cur_sg->length; + + if (host->data_remain > host->data_len) + host->data_remain = host->data_len; +} + +static inline int moxart_next_sg(struct moxart_host *host) +{ + int remain; 
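+	/*
+	 * Editorial note, not part of the submitted driver: moxart_init_sg()
+	 * and moxart_next_sg() walk the request's scatterlist one entry at
+	 * a time. data_remain is clamped against the total request length
+	 * (data_len = blocks * blksz, set up in moxart_prepare_data())
+	 * because the final scatterlist entry may be longer than what is
+	 * left of the request.
+	 */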
+ struct mmc_data *data = host->mrq->cmd->data; + + host->cur_sg++; + host->num_sg--; + + if (host->num_sg > 0) { + host->data_remain = host->cur_sg->length; + remain = host->data_len - data->bytes_xfered; + if (remain > 0 && remain < host->data_remain) + host->data_remain = remain; + } + + return host->num_sg; +} + +static int moxart_wait_for_status(struct moxart_host *host, + u32 mask, u32 *status) +{ + int ret = -ETIMEDOUT; + u32 i; + + for (i = 0; i < MAX_RETRIES; i++) { + *status = readl(host->base + REG_STATUS); + if (!(*status & mask)) { + udelay(5); + continue; + } + writel(*status & mask, host->base + REG_CLEAR); + ret = 0; + break; + } + + if (ret) + dev_err(mmc_dev(host->mmc), "timed out waiting for status\n"); + + return ret; +} + + +static void moxart_send_command(struct moxart_host *host, + struct mmc_command *cmd) +{ + u32 status, cmdctrl; + + writel(RSP_TIMEOUT | RSP_CRC_OK | + RSP_CRC_FAIL | CMD_SENT, host->base + REG_CLEAR); + writel(cmd->arg, host->base + REG_ARGUMENT); + + cmdctrl = cmd->opcode & CMD_IDX_MASK; + if (cmdctrl == SD_APP_SET_BUS_WIDTH || cmdctrl == SD_APP_OP_COND || + cmdctrl == SD_APP_SEND_SCR || cmdctrl == SD_APP_SD_STATUS || + cmdctrl == SD_APP_SEND_NUM_WR_BLKS) + cmdctrl |= CMD_APP_CMD; + + if (cmd->flags & MMC_RSP_PRESENT) + cmdctrl |= CMD_NEED_RSP; + + if (cmd->flags & MMC_RSP_136) + cmdctrl |= CMD_LONG_RSP; + + writel(cmdctrl | CMD_EN, host->base + REG_COMMAND); + + if (moxart_wait_for_status(host, MASK_RSP, &status) == -ETIMEDOUT) + cmd->error = -ETIMEDOUT; + + if (status & RSP_TIMEOUT) { + cmd->error = -ETIMEDOUT; + return; + } + if (status & RSP_CRC_FAIL) { + cmd->error = -EIO; + return; + } + if (status & RSP_CRC_OK) { + if (cmd->flags & MMC_RSP_136) { + cmd->resp[3] = readl(host->base + REG_RESPONSE0); + cmd->resp[2] = readl(host->base + REG_RESPONSE1); + cmd->resp[1] = readl(host->base + REG_RESPONSE2); + cmd->resp[0] = readl(host->base + REG_RESPONSE3); + } else { + cmd->resp[0] = readl(host->base + REG_RESPONSE0); + } + } +} + +static void moxart_dma_complete(void *param) +{ + struct moxart_host *host = param; + + complete(&host->dma_complete); +} + +static void moxart_transfer_dma(struct mmc_data *data, struct moxart_host *host) +{ + u32 len, dir_data, dir_slave; + unsigned long dma_time; + struct dma_async_tx_descriptor *desc = NULL; + struct dma_chan *dma_chan; + + if (host->data_len == data->bytes_xfered) + return; + + if (data->flags & MMC_DATA_WRITE) { + dma_chan = host->dma_chan_tx; + dir_data = DMA_TO_DEVICE; + dir_slave = DMA_MEM_TO_DEV; + } else { + dma_chan = host->dma_chan_rx; + dir_data = DMA_FROM_DEVICE; + dir_slave = DMA_DEV_TO_MEM; + } + + len = dma_map_sg(dma_chan->device->dev, data->sg, + data->sg_len, dir_data); + + if (len > 0) { + desc = dmaengine_prep_slave_sg(dma_chan, data->sg, + len, dir_slave, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); + } else { + dev_err(mmc_dev(host->mmc), "dma_map_sg returned zero length\n"); + } + + if (desc) { + host->tx_desc = desc; + desc->callback = moxart_dma_complete; + desc->callback_param = host; + dmaengine_submit(desc); + dma_async_issue_pending(dma_chan); + } + + data->bytes_xfered += host->data_remain; + + dma_time = wait_for_completion_interruptible_timeout( + &host->dma_complete, host->timeout); + + dma_unmap_sg(dma_chan->device->dev, + data->sg, data->sg_len, + dir_data); +} + + +static void moxart_transfer_pio(struct moxart_host *host) +{ + struct mmc_data *data = host->mrq->cmd->data; + u32 *sgp, len = 0, remain, status; + + if (host->data_len == data->bytes_xfered) + return; + + 
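+	/*
+	 * Editor's note (commentary, not in the original patch): the loop
+	 * below drains or fills the controller FIFO for the current
+	 * scatterlist chunk in bursts of at most fifo_width bytes, 32 bits
+	 * at a time through REG_DATA_WINDOW. Each burst is gated on the
+	 * FIFO underrun (write) or overrun (read) status bit, and the
+	 * function either advances to the next scatterlist entry or
+	 * completes pio_complete once the whole request has transferred.
+	 */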
sgp = sg_virt(host->cur_sg); + remain = host->data_remain; + + if (data->flags & MMC_DATA_WRITE) { + while (remain > 0) { + if (moxart_wait_for_status(host, FIFO_URUN, &status) + == -ETIMEDOUT) { + data->error = -ETIMEDOUT; + complete(&host->pio_complete); + return; + } + for (len = 0; len < remain && len < host->fifo_width;) { + iowrite32(*sgp, host->base + REG_DATA_WINDOW); + sgp++; + len += 4; + } + remain -= len; + } + + } else { + while (remain > 0) { + if (moxart_wait_for_status(host, FIFO_ORUN, &status) + == -ETIMEDOUT) { + data->error = -ETIMEDOUT; + complete(&host->pio_complete); + return; + } + for (len = 0; len < remain && len < host->fifo_width;) { + /* SCR data must be read in big endian. */ + if (data->mrq->cmd->opcode == SD_APP_SEND_SCR) + *sgp = ioread32be(host->base + + REG_DATA_WINDOW); + else + *sgp = ioread32(host->base + + REG_DATA_WINDOW); + sgp++; + len += 4; + } + remain -= len; + } + } + + data->bytes_xfered += host->data_remain - remain; + host->data_remain = remain; + + if (host->data_len != data->bytes_xfered) + moxart_next_sg(host); + else + complete(&host->pio_complete); +} + +static void moxart_prepare_data(struct moxart_host *host) +{ + struct mmc_data *data = host->mrq->cmd->data; + u32 datactrl; + int blksz_bits; + + if (!data) + return; + + host->data_len = data->blocks * data->blksz; + blksz_bits = ffs(data->blksz) - 1; + BUG_ON(1 << blksz_bits != data->blksz); + + moxart_init_sg(host, data); + + datactrl = DCR_DATA_EN | (blksz_bits & DCR_BLK_SIZE); + + if (data->flags & MMC_DATA_WRITE) + datactrl |= DCR_DATA_WRITE; + + if ((host->data_len > host->fifo_width) && host->have_dma) + datactrl |= DCR_DMA_EN; + + writel(DCR_DATA_FIFO_RESET, host->base + REG_DATA_CONTROL); + writel(MASK_DATA | FIFO_URUN | FIFO_ORUN, host->base + REG_CLEAR); + writel(host->rate, host->base + REG_DATA_TIMER); + writel(host->data_len, host->base + REG_DATA_LENGTH); + writel(datactrl, host->base + REG_DATA_CONTROL); +} + +static void moxart_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct moxart_host *host = mmc_priv(mmc); + unsigned long pio_time, flags; + u32 status; + + spin_lock_irqsave(&host->lock, flags); + + init_completion(&host->dma_complete); + init_completion(&host->pio_complete); + + host->mrq = mrq; + + if (readl(host->base + REG_STATUS) & CARD_DETECT) { + mrq->cmd->error = -ETIMEDOUT; + goto request_done; + } + + moxart_prepare_data(host); + moxart_send_command(host, host->mrq->cmd); + + if (mrq->cmd->data) { + if ((host->data_len > host->fifo_width) && host->have_dma) { + + writel(CARD_CHANGE, host->base + REG_INTERRUPT_MASK); + + spin_unlock_irqrestore(&host->lock, flags); + + moxart_transfer_dma(mrq->cmd->data, host); + + spin_lock_irqsave(&host->lock, flags); + } else { + + writel(MASK_INTR_PIO, host->base + REG_INTERRUPT_MASK); + + spin_unlock_irqrestore(&host->lock, flags); + + /* PIO transfers start from interrupt. 
*/ + pio_time = wait_for_completion_interruptible_timeout( + &host->pio_complete, host->timeout); + + spin_lock_irqsave(&host->lock, flags); + } + + if (host->is_removed) { + dev_err(mmc_dev(host->mmc), "card removed\n"); + mrq->cmd->error = -ETIMEDOUT; + goto request_done; + } + + if (moxart_wait_for_status(host, MASK_DATA, &status) + == -ETIMEDOUT) { + mrq->cmd->data->error = -ETIMEDOUT; + goto request_done; + } + + if (status & DATA_CRC_FAIL) + mrq->cmd->data->error = -ETIMEDOUT; + + if (mrq->cmd->data->stop) + moxart_send_command(host, mrq->cmd->data->stop); + } + +request_done: + spin_unlock_irqrestore(&host->lock, flags); + mmc_request_done(host->mmc, mrq); +} + +static irqreturn_t moxart_irq(int irq, void *devid) +{ + struct moxart_host *host = (struct moxart_host *)devid; + u32 status; + unsigned long flags; + + spin_lock_irqsave(&host->lock, flags); + + status = readl(host->base + REG_STATUS); + if (status & CARD_CHANGE) { + host->is_removed = status & CARD_DETECT; + if (host->is_removed && host->have_dma) { + dmaengine_terminate_all(host->dma_chan_tx); + dmaengine_terminate_all(host->dma_chan_rx); + } + host->mrq = NULL; + writel(MASK_INTR_PIO, host->base + REG_CLEAR); + writel(CARD_CHANGE, host->base + REG_INTERRUPT_MASK); + mmc_detect_change(host->mmc, 0); + } + if (status & (FIFO_ORUN | FIFO_URUN) && host->mrq) + moxart_transfer_pio(host); + + spin_unlock_irqrestore(&host->lock, flags); + + return IRQ_HANDLED; +} + +static void moxart_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct moxart_host *host = mmc_priv(mmc); + unsigned long flags; + u8 power, div; + u32 ctrl; + + spin_lock_irqsave(&host->lock, flags); + + if (ios->clock) { + for (div = 0; div < CLK_DIV_MASK; ++div) { + if (ios->clock >= host->sysclk / (2 * (div + 1))) + break; + } + ctrl = CLK_SD | div; + host->rate = host->sysclk / (2 * (div + 1)); + if (host->rate > host->sysclk) + ctrl |= CLK_HISPD; + writel(ctrl, host->base + REG_CLOCK_CONTROL); + } + + if (ios->power_mode == MMC_POWER_OFF) { + writel(readl(host->base + REG_POWER_CONTROL) & ~SD_POWER_ON, + host->base + REG_POWER_CONTROL); + } else { + if (ios->vdd < MIN_POWER) + power = 0; + else + power = ios->vdd - MIN_POWER; + + writel(SD_POWER_ON | (u32) power, + host->base + REG_POWER_CONTROL); + } + + switch (ios->bus_width) { + case MMC_BUS_WIDTH_4: + writel(BUS_WIDTH_4, host->base + REG_BUS_WIDTH); + break; + case MMC_BUS_WIDTH_8: + writel(BUS_WIDTH_8, host->base + REG_BUS_WIDTH); + break; + default: + writel(BUS_WIDTH_1, host->base + REG_BUS_WIDTH); + break; + } + + spin_unlock_irqrestore(&host->lock, flags); +} + + +static int moxart_get_ro(struct mmc_host *mmc) +{ + struct moxart_host *host = mmc_priv(mmc); + + return !!(readl(host->base + REG_STATUS) & WRITE_PROT); +} + +static struct mmc_host_ops moxart_ops = { + .request = moxart_request, + .set_ios = moxart_set_ios, + .get_ro = moxart_get_ro, +}; + +static int moxart_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct resource res_mmc; + struct mmc_host *mmc; + struct moxart_host *host = NULL; + struct dma_slave_config cfg; + struct clk *clk; + void __iomem *reg_mmc; + dma_cap_mask_t mask; + int irq, ret; + u32 i; + + mmc = mmc_alloc_host(sizeof(struct moxart_host), dev); + if (!mmc) { + dev_err(dev, "mmc_alloc_host failed\n"); + ret = -ENOMEM; + goto out; + } + + ret = of_address_to_resource(node, 0, &res_mmc); + if (ret) { + dev_err(dev, "of_address_to_resource failed\n"); + goto out; + } + + irq = 
irq_of_parse_and_map(node, 0); + if (irq <= 0) { + dev_err(dev, "irq_of_parse_and_map failed\n"); + ret = -EINVAL; + goto out; + } + + clk = of_clk_get(node, 0); + if (IS_ERR(clk)) { + dev_err(dev, "of_clk_get failed\n"); + ret = PTR_ERR(clk); + goto out; + } + + reg_mmc = devm_ioremap_resource(dev, &res_mmc); + if (IS_ERR(reg_mmc)) { + ret = PTR_ERR(reg_mmc); + goto out; + } + + mmc_of_parse(mmc); + + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + + host = mmc_priv(mmc); + host->mmc = mmc; + host->base = reg_mmc; + host->reg_phys = res_mmc.start; + host->timeout = msecs_to_jiffies(1000); + host->sysclk = clk_get_rate(clk); + host->fifo_width = readl(host->base + REG_FEATURE) << 2; + host->dma_chan_tx = of_dma_request_slave_channel(node, "tx"); + host->dma_chan_rx = of_dma_request_slave_channel(node, "rx"); + + spin_lock_init(&host->lock); + + mmc->ops = &moxart_ops; + mmc->f_max = DIV_ROUND_CLOSEST(host->sysclk, 2); + mmc->f_min = DIV_ROUND_CLOSEST(host->sysclk, CLK_DIV_MASK * 2); + mmc->ocr_avail = 0xffff00; /* Support 2.0v - 3.6v power. */ + + if (IS_ERR(host->dma_chan_tx) || IS_ERR(host->dma_chan_rx)) { + dev_dbg(dev, "PIO mode transfer enabled\n"); + host->have_dma = false; + } else { + dev_dbg(dev, "DMA channels found (%p,%p)\n", + host->dma_chan_tx, host->dma_chan_rx); + host->have_dma = true; + + cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + + cfg.direction = DMA_MEM_TO_DEV; + cfg.src_addr = 0; + cfg.dst_addr = host->reg_phys + REG_DATA_WINDOW; + dmaengine_slave_config(host->dma_chan_tx, &cfg); + + cfg.direction = DMA_DEV_TO_MEM; + cfg.src_addr = host->reg_phys + REG_DATA_WINDOW; + cfg.dst_addr = 0; + dmaengine_slave_config(host->dma_chan_rx, &cfg); + } + + switch ((readl(host->base + REG_BUS_WIDTH) >> 3) & 3) { + case 1: + mmc->caps |= MMC_CAP_4_BIT_DATA; + break; + case 2: + mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA; + break; + default: + break; + } + + writel(0, host->base + REG_INTERRUPT_MASK); + + writel(CMD_SDC_RESET, host->base + REG_COMMAND); + for (i = 0; i < MAX_RETRIES; i++) { + if (!(readl(host->base + REG_COMMAND) & CMD_SDC_RESET)) + break; + udelay(5); + } + + ret = devm_request_irq(dev, irq, moxart_irq, 0, "moxart-mmc", host); + if (ret) + goto out; + + dev_set_drvdata(dev, mmc); + mmc_add_host(mmc); + + dev_dbg(dev, "IRQ=%d, FIFO is %d bytes\n", irq, host->fifo_width); + + return 0; + +out: + if (mmc) + mmc_free_host(mmc); + return ret; +} + +static int moxart_remove(struct platform_device *pdev) +{ + struct mmc_host *mmc = dev_get_drvdata(&pdev->dev); + struct moxart_host *host = mmc_priv(mmc); + + dev_set_drvdata(&pdev->dev, NULL); + + if (mmc) { + if (!IS_ERR(host->dma_chan_tx)) + dma_release_channel(host->dma_chan_tx); + if (!IS_ERR(host->dma_chan_rx)) + dma_release_channel(host->dma_chan_rx); + mmc_remove_host(mmc); + mmc_free_host(mmc); + + writel(0, host->base + REG_INTERRUPT_MASK); + writel(0, host->base + REG_POWER_CONTROL); + writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, + host->base + REG_CLOCK_CONTROL); + } + + kfree(host); + + return 0; +} + +static const struct of_device_id moxart_mmc_match[] = { + { .compatible = "moxa,moxart-mmc" }, + { .compatible = "faraday,ftsdc010" }, + { } +}; + +static struct platform_driver moxart_mmc_driver = { + .probe = moxart_probe, + .remove = moxart_remove, + .driver = { + .name = "mmc-moxart", + .owner = THIS_MODULE, + .of_match_table = moxart_mmc_match, + }, +}; +module_platform_driver(moxart_mmc_driver); + 
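+/*
+ * Editorial aside, not part of the submission: module_platform_driver()
+ * generates the usual registration boilerplate, expanding to roughly:
+ *
+ *	static int __init moxart_mmc_driver_init(void)
+ *	{
+ *		return platform_driver_register(&moxart_mmc_driver);
+ *	}
+ *	module_init(moxart_mmc_driver_init);
+ *
+ *	static void __exit moxart_mmc_driver_exit(void)
+ *	{
+ *		platform_driver_unregister(&moxart_mmc_driver);
+ *	}
+ *	module_exit(moxart_mmc_driver_exit);
+ */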
+MODULE_ALIAS("platform:mmc-moxart"); +MODULE_DESCRIPTION("MOXA ART MMC driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jonas Jensen <jonas.jensen@gmail.com>"); diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 45aa2206741d..9377284f8544 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -354,6 +354,20 @@ static irqreturn_t mvsd_irq(int irq, void *dev) intr_status, mvsd_read(MVSD_NOR_INTR_EN), mvsd_read(MVSD_HW_STATE)); + /* + * It looks like, SDIO IP can issue one late, spurious irq + * although all irqs should be disabled. To work around this, + * bail out early, if we didn't expect any irqs to occur. + */ + if (!mvsd_read(MVSD_NOR_INTR_EN) && !mvsd_read(MVSD_ERR_INTR_EN)) { + dev_dbg(host->dev, "spurious irq detected intr 0x%04x intr_en 0x%04x erri 0x%04x erri_en 0x%04x\n", + mvsd_read(MVSD_NOR_INTR_STATUS), + mvsd_read(MVSD_NOR_INTR_EN), + mvsd_read(MVSD_ERR_INTR_STATUS), + mvsd_read(MVSD_ERR_INTR_EN)); + return IRQ_HANDLED; + } + spin_lock(&host->lock); /* PIO handling, if needed. Messy business... */ @@ -801,10 +815,10 @@ static int mvsd_probe(struct platform_device *pdev) goto out; if (!(mmc->caps & MMC_CAP_NEEDS_POLL)) - dev_notice(&pdev->dev, "using GPIO for card detection\n"); + dev_dbg(&pdev->dev, "using GPIO for card detection\n"); else - dev_notice(&pdev->dev, - "lacking card detect (fall back to polling)\n"); + dev_dbg(&pdev->dev, "lacking card detect (fall back to polling)\n"); + return 0; out: diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index f7199c83f5cf..ed1cb93c3784 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -124,9 +124,8 @@ enum mxcmci_type { struct mxcmci_host { struct mmc_host *mmc; - struct resource *res; void __iomem *base; - int irq; + dma_addr_t phys_base; int detect_irq; struct dma_chan *dma; struct dma_async_tx_descriptor *desc; @@ -154,8 +153,6 @@ struct mxcmci_host { struct work_struct datawork; spinlock_t lock; - struct regulator *vcc; - int burstlen; int dmareq; struct dma_slave_config dma_slave_config; @@ -241,37 +238,15 @@ static inline void mxcmci_writew(struct mxcmci_host *host, u16 val, int reg) static void mxcmci_set_clk_rate(struct mxcmci_host *host, unsigned int clk_ios); -static inline void mxcmci_init_ocr(struct mxcmci_host *host) -{ - host->vcc = regulator_get(mmc_dev(host->mmc), "vmmc"); - - if (IS_ERR(host->vcc)) { - host->vcc = NULL; - } else { - host->mmc->ocr_avail = mmc_regulator_get_ocrmask(host->vcc); - if (host->pdata && host->pdata->ocr_avail) - dev_warn(mmc_dev(host->mmc), - "pdata->ocr_avail will not be used\n"); - } - - if (host->vcc == NULL) { - /* fall-back to platform data */ - if (host->pdata && host->pdata->ocr_avail) - host->mmc->ocr_avail = host->pdata->ocr_avail; - else - host->mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; - } -} - -static inline void mxcmci_set_power(struct mxcmci_host *host, - unsigned char power_mode, - unsigned int vdd) +static void mxcmci_set_power(struct mxcmci_host *host, unsigned int vdd) { - if (host->vcc) { - if (power_mode == MMC_POWER_UP) - mmc_regulator_set_ocr(host->mmc, host->vcc, vdd); - else if (power_mode == MMC_POWER_OFF) - mmc_regulator_set_ocr(host->mmc, host->vcc, 0); + if (!IS_ERR(host->mmc->supply.vmmc)) { + if (host->power_mode == MMC_POWER_UP) + mmc_regulator_set_ocr(host->mmc, + host->mmc->supply.vmmc, vdd); + else if (host->power_mode == MMC_POWER_OFF) + mmc_regulator_set_ocr(host->mmc, + host->mmc->supply.vmmc, 0); } if (host->pdata && host->pdata->setpower) @@ -299,7 +274,6 
@@ static void mxcmci_softreset(struct mxcmci_host *host) mxcmci_writew(host, 0xff, MMC_REG_RES_TO); } -static int mxcmci_setup_dma(struct mmc_host *mmc); #if IS_ENABLED(CONFIG_PPC_MPC512x) static inline void buffer_swap32(u32 *buf, int len) @@ -868,8 +842,8 @@ static int mxcmci_setup_dma(struct mmc_host *mmc) struct mxcmci_host *host = mmc_priv(mmc); struct dma_slave_config *config = &host->dma_slave_config; - config->dst_addr = host->res->start + MMC_REG_BUFFER_ACCESS; - config->src_addr = host->res->start + MMC_REG_BUFFER_ACCESS; + config->dst_addr = host->phys_base + MMC_REG_BUFFER_ACCESS; + config->src_addr = host->phys_base + MMC_REG_BUFFER_ACCESS; config->dst_addr_width = 4; config->src_addr_width = 4; config->dst_maxburst = host->burstlen; @@ -911,8 +885,8 @@ static void mxcmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->cmdat &= ~CMD_DAT_CONT_BUS_WIDTH_4; if (host->power_mode != ios->power_mode) { - mxcmci_set_power(host, ios->power_mode, ios->vdd); host->power_mode = ios->power_mode; + mxcmci_set_power(host, ios->vdd); if (ios->power_mode == MMC_POWER_ON) host->cmdat |= CMD_DAT_CONT_INIT; @@ -1040,8 +1014,8 @@ static const struct mmc_host_ops mxcmci_ops = { static int mxcmci_probe(struct platform_device *pdev) { struct mmc_host *mmc; - struct mxcmci_host *host = NULL; - struct resource *iores, *r; + struct mxcmci_host *host; + struct resource *res; int ret = 0, irq; bool dat3_card_detect = false; dma_cap_mask_t mask; @@ -1052,21 +1026,25 @@ static int mxcmci_probe(struct platform_device *pdev) of_id = of_match_device(mxcmci_of_match, &pdev->dev); - iores = platform_get_resource(pdev, IORESOURCE_MEM, 0); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); irq = platform_get_irq(pdev, 0); - if (!iores || irq < 0) + if (irq < 0) return -EINVAL; - r = request_mem_region(iores->start, resource_size(iores), pdev->name); - if (!r) - return -EBUSY; + mmc = mmc_alloc_host(sizeof(*host), &pdev->dev); + if (!mmc) + return -ENOMEM; + + host = mmc_priv(mmc); - mmc = mmc_alloc_host(sizeof(struct mxcmci_host), &pdev->dev); - if (!mmc) { - ret = -ENOMEM; - goto out_release_mem; + host->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(host->base)) { + ret = PTR_ERR(host->base); + goto out_free; } + host->phys_base = res->start; + ret = mmc_of_parse(mmc); if (ret) goto out_free; @@ -1084,13 +1062,6 @@ static int mxcmci_probe(struct platform_device *pdev) mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; mmc->max_seg_size = mmc->max_req_size; - host = mmc_priv(mmc); - host->base = ioremap(r->start, resource_size(r)); - if (!host->base) { - ret = -ENOMEM; - goto out_free; - } - if (of_id) { const struct platform_device_id *id_entry = of_id->data; host->devtype = id_entry->driver_data; @@ -1112,7 +1083,14 @@ static int mxcmci_probe(struct platform_device *pdev) && !of_property_read_bool(pdev->dev.of_node, "cd-gpios")) dat3_card_detect = true; - mxcmci_init_ocr(host); + ret = mmc_regulator_get_supply(mmc); + if (ret) { + if (pdata && ret != -EPROBE_DEFER) + mmc->ocr_avail = pdata->ocr_avail ? 
: + MMC_VDD_32_33 | MMC_VDD_33_34; + else + goto out_free; + } if (dat3_card_detect) host->default_irq_mask = @@ -1120,19 +1098,16 @@ static int mxcmci_probe(struct platform_device *pdev) else host->default_irq_mask = 0; - host->res = r; - host->irq = irq; - host->clk_ipg = devm_clk_get(&pdev->dev, "ipg"); if (IS_ERR(host->clk_ipg)) { ret = PTR_ERR(host->clk_ipg); - goto out_iounmap; + goto out_free; } host->clk_per = devm_clk_get(&pdev->dev, "per"); if (IS_ERR(host->clk_per)) { ret = PTR_ERR(host->clk_per); - goto out_iounmap; + goto out_free; } clk_prepare_enable(host->clk_per); @@ -1159,9 +1134,9 @@ static int mxcmci_probe(struct platform_device *pdev) if (!host->pdata) { host->dma = dma_request_slave_channel(&pdev->dev, "rx-tx"); } else { - r = platform_get_resource(pdev, IORESOURCE_DMA, 0); - if (r) { - host->dmareq = r->start; + res = platform_get_resource(pdev, IORESOURCE_DMA, 0); + if (res) { + host->dmareq = res->start; host->dma_data.peripheral_type = IMX_DMATYPE_SDHC; host->dma_data.priority = DMA_PRIO_LOW; host->dma_data.dma_request = host->dmareq; @@ -1178,7 +1153,8 @@ static int mxcmci_probe(struct platform_device *pdev) INIT_WORK(&host->datawork, mxcmci_datawork); - ret = request_irq(host->irq, mxcmci_irq, 0, DRIVER_NAME, host); + ret = devm_request_irq(&pdev->dev, irq, mxcmci_irq, 0, + dev_name(&pdev->dev), host); if (ret) goto out_free_dma; @@ -1188,7 +1164,7 @@ static int mxcmci_probe(struct platform_device *pdev) ret = host->pdata->init(&pdev->dev, mxcmci_detect_irq, host->mmc); if (ret) - goto out_free_irq; + goto out_free_dma; } init_timer(&host->watchdog); @@ -1199,20 +1175,17 @@ static int mxcmci_probe(struct platform_device *pdev) return 0; -out_free_irq: - free_irq(host->irq, host); out_free_dma: if (host->dma) dma_release_channel(host->dma); + out_clk_put: clk_disable_unprepare(host->clk_per); clk_disable_unprepare(host->clk_ipg); -out_iounmap: - iounmap(host->base); + out_free: mmc_free_host(mmc); -out_release_mem: - release_mem_region(iores->start, resource_size(iores)); + return ret; } @@ -1223,30 +1196,21 @@ static int mxcmci_remove(struct platform_device *pdev) mmc_remove_host(mmc); - if (host->vcc) - regulator_put(host->vcc); - if (host->pdata && host->pdata->exit) host->pdata->exit(&pdev->dev, mmc); - free_irq(host->irq, host); - iounmap(host->base); - if (host->dma) dma_release_channel(host->dma); clk_disable_unprepare(host->clk_per); clk_disable_unprepare(host->clk_ipg); - release_mem_region(host->res->start, resource_size(host->res)); - mmc_free_host(mmc); return 0; } -#ifdef CONFIG_PM -static int mxcmci_suspend(struct device *dev) +static int __maybe_unused mxcmci_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct mxcmci_host *host = mmc_priv(mmc); @@ -1256,7 +1220,7 @@ static int mxcmci_suspend(struct device *dev) return 0; } -static int mxcmci_resume(struct device *dev) +static int __maybe_unused mxcmci_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct mxcmci_host *host = mmc_priv(mmc); @@ -1266,11 +1230,7 @@ static int mxcmci_resume(struct device *dev) return 0; } -static const struct dev_pm_ops mxcmci_pm_ops = { - .suspend = mxcmci_suspend, - .resume = mxcmci_resume, -}; -#endif +static SIMPLE_DEV_PM_OPS(mxcmci_pm_ops, mxcmci_suspend, mxcmci_resume); static struct platform_driver mxcmci_driver = { .probe = mxcmci_probe, @@ -1279,9 +1239,7 @@ static struct platform_driver mxcmci_driver = { .driver = { .name = DRIVER_NAME, .owner = THIS_MODULE, -#ifdef CONFIG_PM .pm = &mxcmci_pm_ops, 
-#endif .of_match_table = mxcmci_of_match, } }; diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index 073e871a0fc8..babfea03ba8a 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -70,6 +70,7 @@ struct mxs_mmc_host { unsigned char bus_width; spinlock_t lock; int sdio_irq_en; + bool broken_cd; }; static int mxs_mmc_get_cd(struct mmc_host *mmc) @@ -78,6 +79,9 @@ static int mxs_mmc_get_cd(struct mmc_host *mmc) struct mxs_ssp *ssp = &host->ssp; int present, ret; + if (host->broken_cd) + return -ENOSYS; + ret = mmc_gpio_get_cd(mmc); if (ret >= 0) return ret; @@ -568,6 +572,7 @@ static int mxs_mmc_probe(struct platform_device *pdev) { const struct of_device_id *of_id = of_match_device(mxs_mmc_dt_ids, &pdev->dev); + struct device_node *np = pdev->dev.of_node; struct mxs_mmc_host *host; struct mmc_host *mmc; struct resource *iores; @@ -634,6 +639,8 @@ static int mxs_mmc_probe(struct platform_device *pdev) mmc->caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_NEEDS_POLL; + host->broken_cd = of_property_read_bool(np, "broken-cd"); + mmc->f_min = 400000; mmc->f_max = 288000000; diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 5c2e58b29305..81974ecdfcbc 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -177,7 +177,7 @@ static void mmc_omap_fclk_offdelay(struct mmc_omap_slot *slot) unsigned long tick_ns; if (slot != NULL && slot->host->fclk_enabled && slot->fclk_freq > 0) { - tick_ns = (1000000000 + slot->fclk_freq - 1) / slot->fclk_freq; + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, slot->fclk_freq); ndelay(8 * tick_ns); } } @@ -435,7 +435,7 @@ static void mmc_omap_send_stop_work(struct work_struct *work) struct mmc_data *data = host->stop_data; unsigned long tick_ns; - tick_ns = (1000000000 + slot->fclk_freq - 1)/slot->fclk_freq; + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, slot->fclk_freq); ndelay(8*tick_ns); mmc_omap_start_command(host, data->stop); @@ -477,7 +477,7 @@ mmc_omap_send_abort(struct mmc_omap_host *host, int maxloops) u16 stat = 0; /* Sending abort takes 80 clocks. 
Have some extra and round up */ - timeout = (120*1000000 + slot->fclk_freq - 1)/slot->fclk_freq; + timeout = DIV_ROUND_UP(120 * USEC_PER_SEC, slot->fclk_freq); restarts = 0; while (restarts < maxloops) { OMAP_MMC_WRITE(host, STAT, 0xFFFF); @@ -677,8 +677,8 @@ mmc_omap_xfer_data(struct mmc_omap_host *host, int write) if (n > host->buffer_bytes_left) n = host->buffer_bytes_left; - nwords = n / 2; - nwords += n & 1; /* handle odd number of bytes to transfer */ + /* Round up to handle odd number of bytes to transfer */ + nwords = DIV_ROUND_UP(n, 2); host->buffer_bytes_left -= n; host->total_bytes_left -= n; diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index e91ee21549d0..6b7b75585926 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -31,7 +31,7 @@ #include <linux/of.h> #include <linux/of_gpio.h> #include <linux/of_device.h> -#include <linux/omap-dma.h> +#include <linux/omap-dmaengine.h> #include <linux/mmc/host.h> #include <linux/mmc/core.h> #include <linux/mmc/mmc.h> @@ -582,7 +582,7 @@ static void omap_hsmmc_set_clock(struct omap_hsmmc_host *host) * - MMC/SD clock coming out of controller > 25MHz */ if ((mmc_slot(host).features & HSMMC_HAS_HSPE_SUPPORT) && - (ios->timing != MMC_TIMING_UHS_DDR50) && + (ios->timing != MMC_TIMING_MMC_DDR52) && ((OMAP_HSMMC_READ(host->base, CAPA) & HSS) == HSS)) { regval = OMAP_HSMMC_READ(host->base, HCTL); if (clkdiv && (clk_get_rate(host->fclk)/clkdiv) > 25000000) @@ -602,7 +602,7 @@ static void omap_hsmmc_set_bus_width(struct omap_hsmmc_host *host) u32 con; con = OMAP_HSMMC_READ(host->base, CON); - if (ios->timing == MMC_TIMING_UHS_DDR50) + if (ios->timing == MMC_TIMING_MMC_DDR52) con |= DDR; /* configure in DDR mode */ else con &= ~DDR; @@ -920,16 +920,17 @@ omap_hsmmc_xfer_done(struct omap_hsmmc_host *host, struct mmc_data *data) static void omap_hsmmc_cmd_done(struct omap_hsmmc_host *host, struct mmc_command *cmd) { - host->cmd = NULL; - if (host->mrq->sbc && (host->cmd == host->mrq->sbc) && !host->mrq->sbc->error && !(host->flags & AUTO_CMD23)) { + host->cmd = NULL; omap_hsmmc_start_dma_transfer(host); omap_hsmmc_start_command(host, host->mrq->cmd, host->mrq->data); return; } + host->cmd = NULL; + if (cmd->flags & MMC_RSP_PRESENT) { if (cmd->flags & MMC_RSP_136) { /* response type 2 */ @@ -1851,6 +1852,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) unsigned tx_req, rx_req; struct pinctrl *pinctrl; const struct omap_mmc_of_data *data; + void __iomem *base; match = of_match_device(of_match_ptr(omap_mmc_of_match), &pdev->dev); if (match) { @@ -1881,9 +1883,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev) if (res == NULL || irq < 0) return -ENXIO; - res = request_mem_region(res->start, resource_size(res), pdev->name); - if (res == NULL) - return -EBUSY; + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); ret = omap_hsmmc_gpio_init(pdata); if (ret) @@ -1904,7 +1906,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) host->irq = irq; host->slot_id = 0; host->mapbase = res->start + pdata->reg_offset; - host->base = ioremap(host->mapbase, SZ_4K); + host->base = base + pdata->reg_offset; host->power_mode = MMC_POWER_OFF; host->next_data.cookie = 1; host->pbias_enabled = 0; @@ -1922,7 +1924,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) spin_lock_init(&host->irq_lock); - host->fclk = clk_get(&pdev->dev, "fck"); + host->fclk = devm_clk_get(&pdev->dev, "fck"); if (IS_ERR(host->fclk)) { ret = 
PTR_ERR(host->fclk); host->fclk = NULL; @@ -1941,7 +1943,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) omap_hsmmc_context_save(host); - host->dbclk = clk_get(&pdev->dev, "mmchsdb_fck"); + host->dbclk = devm_clk_get(&pdev->dev, "mmchsdb_fck"); /* * MMC can still work without debounce clock. */ @@ -1949,7 +1951,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev) host->dbclk = NULL; } else if (clk_prepare_enable(host->dbclk) != 0) { dev_warn(mmc_dev(host->mmc), "Failed to enable debounce clk\n"); - clk_put(host->dbclk); host->dbclk = NULL; } @@ -2018,7 +2019,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) } /* Request IRQ for MMC operations */ - ret = request_irq(host->irq, omap_hsmmc_irq, 0, + ret = devm_request_irq(&pdev->dev, host->irq, omap_hsmmc_irq, 0, mmc_hostname(mmc), host); if (ret) { dev_err(mmc_dev(host->mmc), "Unable to grab HSMMC IRQ\n"); @@ -2029,7 +2030,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) if (pdata->init(&pdev->dev) != 0) { dev_err(mmc_dev(host->mmc), "Unable to configure MMC IRQs\n"); - goto err_irq_cd_init; + goto err_irq; } } @@ -2044,9 +2045,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev) /* Request IRQ for card detect */ if ((mmc_slot(host).card_detect_irq)) { - ret = request_threaded_irq(mmc_slot(host).card_detect_irq, - NULL, - omap_hsmmc_detect, + ret = devm_request_threaded_irq(&pdev->dev, + mmc_slot(host).card_detect_irq, + NULL, omap_hsmmc_detect, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING | IRQF_ONESHOT, mmc_hostname(mmc), host); if (ret) { @@ -2089,15 +2090,12 @@ static int omap_hsmmc_probe(struct platform_device *pdev) err_slot_name: mmc_remove_host(mmc); - free_irq(mmc_slot(host).card_detect_irq, host); err_irq_cd: if (host->use_reg) omap_hsmmc_reg_put(host); err_reg: if (host->pdata->cleanup) host->pdata->cleanup(&pdev->dev); -err_irq_cd_init: - free_irq(host->irq, host); err_irq: if (host->tx_chan) dma_release_channel(host->tx_chan); @@ -2105,27 +2103,19 @@ err_irq: dma_release_channel(host->rx_chan); pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); - clk_put(host->fclk); - if (host->dbclk) { + if (host->dbclk) clk_disable_unprepare(host->dbclk); - clk_put(host->dbclk); - } err1: - iounmap(host->base); mmc_free_host(mmc); err_alloc: omap_hsmmc_gpio_free(pdata); err: - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res) - release_mem_region(res->start, resource_size(res)); return ret; } static int omap_hsmmc_remove(struct platform_device *pdev) { struct omap_hsmmc_host *host = platform_get_drvdata(pdev); - struct resource *res; pm_runtime_get_sync(host->dev); mmc_remove_host(host->mmc); @@ -2133,9 +2123,6 @@ static int omap_hsmmc_remove(struct platform_device *pdev) omap_hsmmc_reg_put(host); if (host->pdata->cleanup) host->pdata->cleanup(&pdev->dev); - free_irq(host->irq, host); - if (mmc_slot(host).card_detect_irq) - free_irq(mmc_slot(host).card_detect_irq, host); if (host->tx_chan) dma_release_channel(host->tx_chan); @@ -2144,20 +2131,12 @@ static int omap_hsmmc_remove(struct platform_device *pdev) pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); - clk_put(host->fclk); - if (host->dbclk) { + if (host->dbclk) clk_disable_unprepare(host->dbclk); - clk_put(host->dbclk); - } omap_hsmmc_gpio_free(host->pdata); - iounmap(host->base); mmc_free_host(host->mmc); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res) - release_mem_region(res->start, resource_size(res)); - return 0; } diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c 
b/drivers/mmc/host/rtsx_pci_sdmmc.c index 0b9ded13a3ae..0d519649b575 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -236,6 +236,9 @@ static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host, case MMC_RSP_R1: rsp_type = SD_RSP_TYPE_R1; break; + case MMC_RSP_R1 & ~MMC_RSP_CRC: + rsp_type = SD_RSP_TYPE_R1 | SD_NO_CHECK_CRC7; + break; case MMC_RSP_R1B: rsp_type = SD_RSP_TYPE_R1b; break; @@ -816,6 +819,7 @@ static int sd_set_timing(struct realtek_pci_sdmmc *host, unsigned char timing) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL, CLK_LOW_FREQ, 0); break; + case MMC_TIMING_MMC_DDR52: case MMC_TIMING_UHS_DDR50: rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG1, 0x0C | SD_ASYNC_FIFO_NOT_RST, @@ -896,6 +900,7 @@ static void sdmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->vpclk = true; host->double_clk = false; break; + case MMC_TIMING_MMC_DDR52: case MMC_TIMING_UHS_DDR50: case MMC_TIMING_UHS_SDR25: host->ssc_depth = RTSX_SSC_DEPTH_1M; diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index e11fafa6fc6b..5d3766e792f0 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -34,7 +34,8 @@ #include <linux/mfd/rtsx_usb.h> #include <asm/unaligned.h> -#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE) +#if defined(CONFIG_LEDS_CLASS) || (defined(CONFIG_LEDS_CLASS_MODULE) && \ + defined(CONFIG_MMC_REALTEK_USB_MODULE)) #include <linux/leds.h> #include <linux/workqueue.h> #define RTSX_USB_USE_LEDS_CLASS @@ -59,7 +60,7 @@ struct rtsx_usb_sdmmc { unsigned char power_mode; -#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE) +#ifdef RTSX_USB_USE_LEDS_CLASS struct led_classdev led; char led_name[32]; struct work_struct led_work; diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index ebb3f392b589..8ce3c28cb76e 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -102,11 +102,19 @@ static void sdhci_acpi_int_hw_reset(struct sdhci_host *host) } static const struct sdhci_ops sdhci_acpi_ops_dflt = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_acpi_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static const struct sdhci_ops sdhci_acpi_ops_int = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_acpi_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, .hw_reset = sdhci_acpi_int_hw_reset, }; diff --git a/drivers/mmc/host/sdhci-bcm-kona.c b/drivers/mmc/host/sdhci-bcm-kona.c index 6f166e63b817..dd780c315a63 100644 --- a/drivers/mmc/host/sdhci-bcm-kona.c +++ b/drivers/mmc/host/sdhci-bcm-kona.c @@ -206,9 +206,13 @@ static void sdhci_bcm_kona_init_74_clocks(struct sdhci_host *host, } static struct sdhci_ops sdhci_bcm_kona_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_bcm_kona_get_max_clk, .get_timeout_clock = sdhci_bcm_kona_get_timeout_clock, .platform_send_init_74_clocks = sdhci_bcm_kona_init_74_clocks, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, .card_event = sdhci_bcm_kona_card_event, }; diff --git a/drivers/mmc/host/sdhci-bcm2835.c b/drivers/mmc/host/sdhci-bcm2835.c index f6d8d67c545f..46af9a439d7b 100644 --- a/drivers/mmc/host/sdhci-bcm2835.c +++ b/drivers/mmc/host/sdhci-bcm2835.c @@ -131,8 +131,12 @@ static const struct sdhci_ops bcm2835_sdhci_ops = { .read_l = 
bcm2835_sdhci_readl, .read_w = bcm2835_sdhci_readw, .read_b = bcm2835_sdhci_readb, + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_min_clock = bcm2835_sdhci_get_min_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static const struct sdhci_pltfm_data bcm2835_sdhci_pdata = { diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c index f2cc26633cb2..14b74075589a 100644 --- a/drivers/mmc/host/sdhci-cns3xxx.c +++ b/drivers/mmc/host/sdhci-cns3xxx.c @@ -30,13 +30,12 @@ static void sdhci_cns3xxx_set_clock(struct sdhci_host *host, unsigned int clock) u16 clk; unsigned long timeout; - if (clock == host->clock) - return; + host->mmc->actual_clock = 0; sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); if (clock == 0) - goto out; + return; while (host->max_clk / div > clock) { /* @@ -75,13 +74,14 @@ static void sdhci_cns3xxx_set_clock(struct sdhci_host *host, unsigned int clock) clk |= SDHCI_CLOCK_CARD_EN; sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); -out: - host->clock = clock; } static const struct sdhci_ops sdhci_cns3xxx_ops = { .get_max_clock = sdhci_cns3xxx_get_max_clk, .set_clock = sdhci_cns3xxx_set_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static const struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { @@ -90,8 +90,7 @@ static const struct sdhci_pltfm_data sdhci_cns3xxx_pdata = { SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | SDHCI_QUIRK_INVERTED_WRITE_PROTECT | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN | - SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | - SDHCI_QUIRK_NONSTANDARD_CLOCK, + SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, }; static int sdhci_cns3xxx_probe(struct platform_device *pdev) diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c index 736d7a2eb7ec..e6278ec007d7 100644 --- a/drivers/mmc/host/sdhci-dove.c +++ b/drivers/mmc/host/sdhci-dove.c @@ -21,28 +21,17 @@ #include <linux/clk.h> #include <linux/err.h> -#include <linux/gpio.h> #include <linux/io.h> #include <linux/mmc/host.h> #include <linux/module.h> #include <linux/of.h> -#include <linux/of_gpio.h> #include "sdhci-pltfm.h" struct sdhci_dove_priv { struct clk *clk; - int gpio_cd; }; -static irqreturn_t sdhci_dove_carddetect_irq(int irq, void *data) -{ - struct sdhci_host *host = data; - - tasklet_schedule(&host->card_tasklet); - return IRQ_HANDLED; -} - static u16 sdhci_dove_readw(struct sdhci_host *host, int reg) { u16 ret; @@ -60,8 +49,6 @@ static u16 sdhci_dove_readw(struct sdhci_host *host, int reg) static u32 sdhci_dove_readl(struct sdhci_host *host, int reg) { - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_dove_priv *priv = pltfm_host->priv; u32 ret; ret = readl(host->ioaddr + reg); @@ -71,14 +58,6 @@ static u32 sdhci_dove_readl(struct sdhci_host *host, int reg) /* Mask the support for 3.0V */ ret &= ~SDHCI_CAN_VDD_300; break; - case SDHCI_PRESENT_STATE: - if (gpio_is_valid(priv->gpio_cd)) { - if (gpio_get_value(priv->gpio_cd) == 0) - ret |= SDHCI_CARD_PRESENT; - else - ret &= ~SDHCI_CARD_PRESENT; - } - break; } return ret; } @@ -86,6 +65,10 @@ static u32 sdhci_dove_readl(struct sdhci_host *host, int reg) static const struct sdhci_ops sdhci_dove_ops = { .read_w = sdhci_dove_readw, .read_l = sdhci_dove_readl, + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static const struct sdhci_pltfm_data sdhci_dove_pdata = { 
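/*
 * Editor's note on the recurring hunk pattern above (commentary, not
 * patch content): this series appears to turn the core clock, bus-width,
 * reset and UHS-signaling handlers into mandatory sdhci_ops callbacks.
 * A driver that is happy with the generic behaviour now wires in the
 * exported helpers explicitly, along these lines (example_ops is a
 * placeholder name):
 *
 *	static const struct sdhci_ops example_ops = {
 *		.set_clock         = sdhci_set_clock,
 *		.set_bus_width     = sdhci_set_bus_width,
 *		.reset             = sdhci_reset,
 *		.set_uhs_signaling = sdhci_set_uhs_signaling,
 *	};
 *
 * while a driver with controller quirks overrides one callback and calls
 * the generic helper from inside the override, as esdhc_reset() and
 * pxav2_reset() do elsewhere in this series.
 */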
@@ -113,28 +96,9 @@ static int sdhci_dove_probe(struct platform_device *pdev) priv->clk = devm_clk_get(&pdev->dev, NULL); - if (pdev->dev.of_node) { - priv->gpio_cd = of_get_named_gpio(pdev->dev.of_node, - "cd-gpios", 0); - } else { - priv->gpio_cd = -EINVAL; - } - - if (gpio_is_valid(priv->gpio_cd)) { - ret = gpio_request(priv->gpio_cd, "sdhci-cd"); - if (ret) { - dev_err(&pdev->dev, "card detect gpio request failed: %d\n", - ret); - return ret; - } - gpio_direction_input(priv->gpio_cd); - } - host = sdhci_pltfm_init(pdev, &sdhci_dove_pdata, 0); - if (IS_ERR(host)) { - ret = PTR_ERR(host); - goto err_sdhci_pltfm_init; - } + if (IS_ERR(host)) + return PTR_ERR(host); pltfm_host = sdhci_priv(host); pltfm_host->priv = priv; @@ -142,39 +106,20 @@ static int sdhci_dove_probe(struct platform_device *pdev) if (!IS_ERR(priv->clk)) clk_prepare_enable(priv->clk); - sdhci_get_of_property(pdev); + ret = mmc_of_parse(host->mmc); + if (ret) + goto err_sdhci_add; ret = sdhci_add_host(host); if (ret) goto err_sdhci_add; - /* - * We must request the IRQ after sdhci_add_host(), as the tasklet only - * gets setup in sdhci_add_host() and we oops. - */ - if (gpio_is_valid(priv->gpio_cd)) { - ret = request_irq(gpio_to_irq(priv->gpio_cd), - sdhci_dove_carddetect_irq, - IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, - mmc_hostname(host->mmc), host); - if (ret) { - dev_err(&pdev->dev, "card detect irq request failed: %d\n", - ret); - goto err_request_irq; - } - } - return 0; -err_request_irq: - sdhci_remove_host(host, 0); err_sdhci_add: if (!IS_ERR(priv->clk)) clk_disable_unprepare(priv->clk); sdhci_pltfm_free(pdev); -err_sdhci_pltfm_init: - if (gpio_is_valid(priv->gpio_cd)) - gpio_free(priv->gpio_cd); return ret; } @@ -186,11 +131,6 @@ static int sdhci_dove_remove(struct platform_device *pdev) sdhci_pltfm_unregister(pdev); - if (gpio_is_valid(priv->gpio_cd)) { - free_irq(gpio_to_irq(priv->gpio_cd), host); - gpio_free(priv->gpio_cd); - } - if (!IS_ERR(priv->clk)) clk_disable_unprepare(priv->clk); diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index b841bb7cd371..ccec0e32590f 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -160,7 +160,6 @@ struct pltfm_imx_data { MULTIBLK_IN_PROCESS, /* exact multiblock cmd in process */ WAIT_FOR_INT, /* sent CMD12, waiting for response INT */ } multiblock_status; - u32 uhs_mode; u32 is_ddr; }; @@ -382,7 +381,6 @@ static u16 esdhc_readw_le(struct sdhci_host *host, int reg) if (val & ESDHC_MIX_CTRL_SMPCLK_SEL) ret |= SDHCI_CTRL_TUNED_CLK; - ret |= (imx_data->uhs_mode & SDHCI_CTRL_UHS_MASK); ret &= ~SDHCI_CTRL_PRESET_VAL_ENABLE; return ret; @@ -429,7 +427,6 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg) else new_val &= ~ESDHC_VENDOR_SPEC_VSELECT; writel(new_val, host->ioaddr + ESDHC_VENDOR_SPEC); - imx_data->uhs_mode = val & SDHCI_CTRL_UHS_MASK; if (imx_data->socdata->flags & ESDHC_FLAG_MAN_TUNING) { new_val = readl(host->ioaddr + ESDHC_MIX_CTRL); if (val & SDHCI_CTRL_TUNED_CLK) @@ -600,12 +597,14 @@ static inline void esdhc_pltfm_set_clock(struct sdhci_host *host, u32 temp, val; if (clock == 0) { + host->mmc->actual_clock = 0; + if (esdhc_is_usdhc(imx_data)) { val = readl(host->ioaddr + ESDHC_VENDOR_SPEC); writel(val & ~ESDHC_VENDOR_SPEC_FRC_SDCLK_ON, host->ioaddr + ESDHC_VENDOR_SPEC); } - goto out; + return; } if (esdhc_is_usdhc(imx_data) && !imx_data->is_ddr) @@ -645,8 +644,6 @@ static inline void esdhc_pltfm_set_clock(struct sdhci_host *host, } mdelay(1); -out: - host->clock 
= clock; } static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host) @@ -668,7 +665,7 @@ static unsigned int esdhc_pltfm_get_ro(struct sdhci_host *host) return -ENOSYS; } -static int esdhc_pltfm_bus_width(struct sdhci_host *host, int width) +static void esdhc_pltfm_set_bus_width(struct sdhci_host *host, int width) { u32 ctrl; @@ -686,8 +683,6 @@ static int esdhc_pltfm_bus_width(struct sdhci_host *host, int width) esdhc_clrset_le(host, ESDHC_CTRL_BUSWIDTH_MASK, ctrl, SDHCI_HOST_CONTROL); - - return 0; } static void esdhc_prepare_tuning(struct sdhci_host *host, u32 val) @@ -697,6 +692,7 @@ static void esdhc_prepare_tuning(struct sdhci_host *host, u32 val) /* FIXME: delay a bit for card to be ready for next tuning due to errors */ mdelay(1); + /* This is balanced by the runtime put in sdhci_tasklet_finish */ pm_runtime_get_sync(host->mmc->parent); reg = readl(host->ioaddr + ESDHC_MIX_CTRL); reg |= ESDHC_MIX_CTRL_EXE_TUNE | ESDHC_MIX_CTRL_SMPCLK_SEL | @@ -713,13 +709,12 @@ static void esdhc_request_done(struct mmc_request *mrq) complete(&mrq->completion); } -static int esdhc_send_tuning_cmd(struct sdhci_host *host, u32 opcode) +static int esdhc_send_tuning_cmd(struct sdhci_host *host, u32 opcode, + struct scatterlist *sg) { struct mmc_command cmd = {0}; struct mmc_request mrq = {NULL}; struct mmc_data data = {0}; - struct scatterlist sg; - char tuning_pattern[ESDHC_TUNING_BLOCK_PATTERN_LEN]; cmd.opcode = opcode; cmd.arg = 0; @@ -728,11 +723,9 @@ static int esdhc_send_tuning_cmd(struct sdhci_host *host, u32 opcode) data.blksz = ESDHC_TUNING_BLOCK_PATTERN_LEN; data.blocks = 1; data.flags = MMC_DATA_READ; - data.sg = &sg; + data.sg = sg; data.sg_len = 1; - sg_init_one(&sg, tuning_pattern, sizeof(tuning_pattern)); - mrq.cmd = &cmd; mrq.cmd->mrq = &mrq; mrq.data = &data; @@ -742,14 +735,12 @@ static int esdhc_send_tuning_cmd(struct sdhci_host *host, u32 opcode) mrq.done = esdhc_request_done; init_completion(&(mrq.completion)); - disable_irq(host->irq); - spin_lock(&host->lock); + spin_lock_irq(&host->lock); host->mrq = &mrq; sdhci_send_command(host, mrq.cmd); - spin_unlock(&host->lock); - enable_irq(host->irq); + spin_unlock_irq(&host->lock); wait_for_completion(&mrq.completion); @@ -772,13 +763,21 @@ static void esdhc_post_tuning(struct sdhci_host *host) static int esdhc_executing_tuning(struct sdhci_host *host, u32 opcode) { + struct scatterlist sg; + char *tuning_pattern; int min, max, avg, ret; + tuning_pattern = kmalloc(ESDHC_TUNING_BLOCK_PATTERN_LEN, GFP_KERNEL); + if (!tuning_pattern) + return -ENOMEM; + + sg_init_one(&sg, tuning_pattern, ESDHC_TUNING_BLOCK_PATTERN_LEN); + /* find the mininum delay first which can pass tuning */ min = ESDHC_TUNE_CTRL_MIN; while (min < ESDHC_TUNE_CTRL_MAX) { esdhc_prepare_tuning(host, min); - if (!esdhc_send_tuning_cmd(host, opcode)) + if (!esdhc_send_tuning_cmd(host, opcode, &sg)) break; min += ESDHC_TUNE_CTRL_STEP; } @@ -787,7 +786,7 @@ static int esdhc_executing_tuning(struct sdhci_host *host, u32 opcode) max = min + ESDHC_TUNE_CTRL_STEP; while (max < ESDHC_TUNE_CTRL_MAX) { esdhc_prepare_tuning(host, max); - if (esdhc_send_tuning_cmd(host, opcode)) { + if (esdhc_send_tuning_cmd(host, opcode, &sg)) { max -= ESDHC_TUNE_CTRL_STEP; break; } @@ -797,9 +796,11 @@ static int esdhc_executing_tuning(struct sdhci_host *host, u32 opcode) /* use average delay to get the best timing */ avg = (min + max) / 2; esdhc_prepare_tuning(host, avg); - ret = esdhc_send_tuning_cmd(host, opcode); + ret = esdhc_send_tuning_cmd(host, opcode, &sg); esdhc_post_tuning(host); + 
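	/*
	 * Editorial note (not in the submitted patch): the loops above do a
	 * linear window search. "min" is stepped up by ESDHC_TUNE_CTRL_STEP
	 * until the tuning command first succeeds, "max" then continues on
	 * from min until the command first fails, and the midpoint
	 * (min + max) / 2 is programmed as the final sample delay on the
	 * assumption that the centre of the passing window has the most
	 * timing margin.
	 */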
kfree(tuning_pattern); + dev_dbg(mmc_dev(host->mmc), "tunning %s at 0x%x ret %d\n", ret ? "failed" : "passed", avg, ret); @@ -837,28 +838,21 @@ static int esdhc_change_pinstate(struct sdhci_host *host, return pinctrl_select_state(imx_data->pinctrl, pinctrl); } -static int esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) +static void esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned timing) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct pltfm_imx_data *imx_data = pltfm_host->priv; struct esdhc_platform_data *boarddata = &imx_data->boarddata; - switch (uhs) { + switch (timing) { case MMC_TIMING_UHS_SDR12: - imx_data->uhs_mode = SDHCI_CTRL_UHS_SDR12; - break; case MMC_TIMING_UHS_SDR25: - imx_data->uhs_mode = SDHCI_CTRL_UHS_SDR25; - break; case MMC_TIMING_UHS_SDR50: - imx_data->uhs_mode = SDHCI_CTRL_UHS_SDR50; - break; case MMC_TIMING_UHS_SDR104: case MMC_TIMING_MMC_HS200: - imx_data->uhs_mode = SDHCI_CTRL_UHS_SDR104; break; case MMC_TIMING_UHS_DDR50: - imx_data->uhs_mode = SDHCI_CTRL_UHS_DDR50; + case MMC_TIMING_MMC_DDR52: writel(readl(host->ioaddr + ESDHC_MIX_CTRL) | ESDHC_MIX_CTRL_DDREN, host->ioaddr + ESDHC_MIX_CTRL); @@ -875,7 +869,15 @@ static int esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) break; } - return esdhc_change_pinstate(host, uhs); + esdhc_change_pinstate(host, timing); +} + +static void esdhc_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } static struct sdhci_ops sdhci_esdhc_ops = { @@ -888,8 +890,9 @@ static struct sdhci_ops sdhci_esdhc_ops = { .get_max_clock = esdhc_pltfm_get_max_clock, .get_min_clock = esdhc_pltfm_get_min_clock, .get_ro = esdhc_pltfm_get_ro, - .platform_bus_width = esdhc_pltfm_bus_width, + .set_bus_width = esdhc_pltfm_set_bus_width, .set_uhs_signaling = esdhc_set_uhs_signaling, + .reset = esdhc_reset, }; static const struct sdhci_pltfm_data sdhci_esdhc_imx_pdata = { @@ -1170,8 +1173,10 @@ static int sdhci_esdhc_runtime_suspend(struct device *dev) ret = sdhci_runtime_suspend_host(host); - clk_disable_unprepare(imx_data->clk_per); - clk_disable_unprepare(imx_data->clk_ipg); + if (!sdhci_sdio_irq_enabled(host)) { + clk_disable_unprepare(imx_data->clk_per); + clk_disable_unprepare(imx_data->clk_ipg); + } clk_disable_unprepare(imx_data->clk_ahb); return ret; @@ -1183,8 +1188,10 @@ static int sdhci_esdhc_runtime_resume(struct device *dev) struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct pltfm_imx_data *imx_data = pltfm_host->priv; - clk_prepare_enable(imx_data->clk_per); - clk_prepare_enable(imx_data->clk_ipg); + if (!sdhci_sdio_irq_enabled(host)) { + clk_prepare_enable(imx_data->clk_per); + clk_prepare_enable(imx_data->clk_ipg); + } clk_prepare_enable(imx_data->clk_ahb); return sdhci_runtime_resume_host(host); diff --git a/drivers/mmc/host/sdhci-esdhc.h b/drivers/mmc/host/sdhci-esdhc.h index a7d9f95a7b03..3497cfaf683c 100644 --- a/drivers/mmc/host/sdhci-esdhc.h +++ b/drivers/mmc/host/sdhci-esdhc.h @@ -20,10 +20,8 @@ #define ESDHC_DEFAULT_QUIRKS (SDHCI_QUIRK_FORCE_BLK_SZ_2048 | \ SDHCI_QUIRK_NO_BUSY_IRQ | \ - SDHCI_QUIRK_NONSTANDARD_CLOCK | \ SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | \ - SDHCI_QUIRK_PIO_NEEDS_DELAY | \ - SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) + SDHCI_QUIRK_PIO_NEEDS_DELAY) #define ESDHC_SYSTEM_CONTROL 0x2c #define ESDHC_CLOCK_MASK 0x0000fff0 diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index 
f7c7cf62437d..5bd1092310f2 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -52,8 +52,12 @@ static unsigned int sdhci_arasan_get_timeout_clock(struct sdhci_host *host) } static struct sdhci_ops sdhci_arasan_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_arasan_get_timeout_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static struct sdhci_pltfm_data sdhci_arasan_pdata = { diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 0b249970b119..8be4dcfb49a0 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -199,13 +199,14 @@ static unsigned int esdhc_of_get_min_clock(struct sdhci_host *host) static void esdhc_of_set_clock(struct sdhci_host *host, unsigned int clock) { - int pre_div = 2; int div = 1; u32 temp; + host->mmc->actual_clock = 0; + if (clock == 0) - goto out; + return; /* Workaround to reduce the clock frequency for p1010 esdhc */ if (of_find_compatible_node(NULL, NULL, "fsl,p1010-esdhc")) { @@ -238,24 +239,8 @@ static void esdhc_of_set_clock(struct sdhci_host *host, unsigned int clock) | (pre_div << ESDHC_PREDIV_SHIFT)); sdhci_writel(host, temp, ESDHC_SYSTEM_CONTROL); mdelay(1); -out: - host->clock = clock; -} - -#ifdef CONFIG_PM -static u32 esdhc_proctl; -static void esdhc_of_suspend(struct sdhci_host *host) -{ - esdhc_proctl = sdhci_be32bs_readl(host, SDHCI_HOST_CONTROL); } -static void esdhc_of_resume(struct sdhci_host *host) -{ - esdhc_of_enable_dma(host); - sdhci_be32bs_writel(host, esdhc_proctl, SDHCI_HOST_CONTROL); -} -#endif - static void esdhc_of_platform_init(struct sdhci_host *host) { u32 vvn; @@ -269,7 +254,7 @@ static void esdhc_of_platform_init(struct sdhci_host *host) host->quirks &= ~SDHCI_QUIRK_NO_BUSY_IRQ; } -static int esdhc_pltfm_bus_width(struct sdhci_host *host, int width) +static void esdhc_pltfm_set_bus_width(struct sdhci_host *host, int width) { u32 ctrl; @@ -289,8 +274,6 @@ static int esdhc_pltfm_bus_width(struct sdhci_host *host, int width) clrsetbits_be32(host->ioaddr + SDHCI_HOST_CONTROL, ESDHC_CTRL_BUSWIDTH_MASK, ctrl); - - return 0; } static const struct sdhci_ops sdhci_esdhc_ops = { @@ -305,13 +288,46 @@ static const struct sdhci_ops sdhci_esdhc_ops = { .get_max_clock = esdhc_of_get_max_clock, .get_min_clock = esdhc_of_get_min_clock, .platform_init = esdhc_of_platform_init, -#ifdef CONFIG_PM - .platform_suspend = esdhc_of_suspend, - .platform_resume = esdhc_of_resume, -#endif .adma_workaround = esdhci_of_adma_workaround, - .platform_bus_width = esdhc_pltfm_bus_width, + .set_bus_width = esdhc_pltfm_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, +}; + +#ifdef CONFIG_PM + +static u32 esdhc_proctl; +static int esdhc_of_suspend(struct device *dev) +{ + struct sdhci_host *host = dev_get_drvdata(dev); + + esdhc_proctl = sdhci_be32bs_readl(host, SDHCI_HOST_CONTROL); + + return sdhci_suspend_host(host); +} + +static int esdhc_of_resume(struct device *dev) +{ + struct sdhci_host *host = dev_get_drvdata(dev); + int ret = sdhci_resume_host(host); + + if (ret == 0) { + /* Isn't this already done by sdhci_resume_host() ? 
--rmk */ + esdhc_of_enable_dma(host); + sdhci_be32bs_writel(host, esdhc_proctl, SDHCI_HOST_CONTROL); + } + + return ret; +} + +static const struct dev_pm_ops esdhc_pmops = { + .suspend = esdhc_of_suspend, + .resume = esdhc_of_resume, }; +#define ESDHC_PMOPS (&esdhc_pmops) +#else +#define ESDHC_PMOPS NULL +#endif static const struct sdhci_pltfm_data sdhci_esdhc_pdata = { /* @@ -374,7 +390,7 @@ static struct platform_driver sdhci_esdhc_driver = { .name = "sdhci-esdhc", .owner = THIS_MODULE, .of_match_table = sdhci_esdhc_of_match, - .pm = SDHCI_PLTFM_PMOPS, + .pm = ESDHC_PMOPS, }, .probe = sdhci_esdhc_probe, .remove = sdhci_esdhc_remove, diff --git a/drivers/mmc/host/sdhci-of-hlwd.c b/drivers/mmc/host/sdhci-of-hlwd.c index 57c514a81ca5..b341661369a2 100644 --- a/drivers/mmc/host/sdhci-of-hlwd.c +++ b/drivers/mmc/host/sdhci-of-hlwd.c @@ -58,6 +58,10 @@ static const struct sdhci_ops sdhci_hlwd_ops = { .write_l = sdhci_hlwd_writel, .write_w = sdhci_hlwd_writew, .write_b = sdhci_hlwd_writeb, + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static const struct sdhci_pltfm_data sdhci_hlwd_pdata = { diff --git a/drivers/mmc/host/sdhci-pci-o2micro.c b/drivers/mmc/host/sdhci-pci-o2micro.c index f49666bcc52a..5670e381b0cf 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.c +++ b/drivers/mmc/host/sdhci-pci-o2micro.c @@ -21,6 +21,45 @@ #include "sdhci-pci.h" #include "sdhci-pci-o2micro.h" +static void o2_pci_set_baseclk(struct sdhci_pci_chip *chip, u32 value) +{ + u32 scratch_32; + pci_read_config_dword(chip->pdev, + O2_SD_PLL_SETTING, &scratch_32); + + scratch_32 &= 0x0000FFFF; + scratch_32 |= value; + + pci_write_config_dword(chip->pdev, + O2_SD_PLL_SETTING, scratch_32); +} + +static void o2_pci_led_enable(struct sdhci_pci_chip *chip) +{ + int ret; + u32 scratch_32; + + /* Enable the LED of the SD host function */ + ret = pci_read_config_dword(chip->pdev, + O2_SD_FUNC_REG0, &scratch_32); + if (ret) + return; + + scratch_32 &= ~O2_SD_FREG0_LEDOFF; + pci_write_config_dword(chip->pdev, + O2_SD_FUNC_REG0, scratch_32); + + ret = pci_read_config_dword(chip->pdev, + O2_SD_TEST_REG, &scratch_32); + if (ret) + return; + + scratch_32 |= O2_SD_LED_ENABLE; + pci_write_config_dword(chip->pdev, + O2_SD_TEST_REG, scratch_32); + +} + void sdhci_pci_o2_fujin2_pci_init(struct sdhci_pci_chip *chip) { u32 scratch_32; @@ -216,6 +255,40 @@ int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) scratch &= 0x7f; pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); + /* Support for DevId=8520 chips with subId 0x11 or 0x12 */ + if (chip->pdev->device == PCI_DEVICE_ID_O2_FUJIN2) { + ret = pci_read_config_dword(chip->pdev, + O2_SD_FUNC_REG0, + &scratch_32); + scratch_32 = ((scratch_32 & 0xFF000000) >> 24); + + /* Check whether subId is 0x11 or 0x12 */ + if ((scratch_32 == 0x11) || (scratch_32 == 0x12)) { + scratch_32 = 0x2c280000; + + /* Set Base Clock to 208MHz */ + o2_pci_set_baseclk(chip, scratch_32); + ret = pci_read_config_dword(chip->pdev, + O2_SD_FUNC_REG4, + &scratch_32); + + /* Enable Base Clk setting change */ + scratch_32 |= O2_SD_FREG4_ENABLE_CLK_SET; + pci_write_config_dword(chip->pdev, + O2_SD_FUNC_REG4, + scratch_32); + + /* Set Tuning Window to 4 */ + pci_write_config_byte(chip->pdev, + O2_SD_TUNING_CTRL, 0x44); + + break; + } + } + + /* Enable 8520 LED function */ + o2_pci_led_enable(chip); + /* Set timeout CLK */ ret = pci_read_config_dword(chip->pdev, O2_SD_CLK_SETTING, &scratch_32); @@ -276,7 +349,7 @@ int
sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch); ret = pci_read_config_dword(chip->pdev, - O2_SD_FUNC_REG0, &scratch_32); + O2_SD_PLL_SETTING, &scratch_32); if ((scratch_32 & 0xff000000) == 0x01000000) { scratch_32 &= 0x0000FFFF; @@ -299,6 +372,9 @@ int sdhci_pci_o2_probe(struct sdhci_pci_chip *chip) O2_SD_FUNC_REG4, scratch_32); } + /* Set Tuning Window to 5 */ + pci_write_config_byte(chip->pdev, + O2_SD_TUNING_CTRL, 0x55); /* Lock WP */ ret = pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch); diff --git a/drivers/mmc/host/sdhci-pci-o2micro.h b/drivers/mmc/host/sdhci-pci-o2micro.h index dbec4c933488..f7ffc908d9a0 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.h +++ b/drivers/mmc/host/sdhci-pci-o2micro.h @@ -57,6 +57,9 @@ #define O2_SD_UHS2_L1_CTRL 0x35C #define O2_SD_FUNC_REG3 0x3E0 #define O2_SD_FUNC_REG4 0x3E4 +#define O2_SD_LED_ENABLE BIT(6) +#define O2_SD_FREG0_LEDOFF BIT(13) +#define O2_SD_FREG4_ENABLE_CLK_SET BIT(22) #define O2_SD_VENDOR_SETTING 0x110 #define O2_SD_VENDOR_SETTING2 0x1C8 diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index fdc612120362..52c42fcc284c 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -1031,7 +1031,7 @@ static int sdhci_pci_enable_dma(struct sdhci_host *host) return 0; } -static int sdhci_pci_bus_width(struct sdhci_host *host, int width) +static void sdhci_pci_set_bus_width(struct sdhci_host *host, int width) { u8 ctrl; @@ -1052,8 +1052,6 @@ static int sdhci_pci_bus_width(struct sdhci_host *host, int width) } sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); - - return 0; } static void sdhci_pci_gpio_hw_reset(struct sdhci_host *host) @@ -1080,8 +1078,11 @@ static void sdhci_pci_hw_reset(struct sdhci_host *host) } static const struct sdhci_ops sdhci_pci_ops = { + .set_clock = sdhci_set_clock, .enable_dma = sdhci_pci_enable_dma, - .platform_bus_width = sdhci_pci_bus_width, + .set_bus_width = sdhci_pci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, .hw_reset = sdhci_pci_hw_reset, }; diff --git a/drivers/mmc/host/sdhci-pltfm.c b/drivers/mmc/host/sdhci-pltfm.c index bef250e95418..7e834fb78f42 100644 --- a/drivers/mmc/host/sdhci-pltfm.c +++ b/drivers/mmc/host/sdhci-pltfm.c @@ -45,6 +45,10 @@ unsigned int sdhci_pltfm_clk_get_max_clock(struct sdhci_host *host) EXPORT_SYMBOL_GPL(sdhci_pltfm_clk_get_max_clock); static const struct sdhci_ops sdhci_pltfm_ops = { + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; #ifdef CONFIG_OF diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c index d51e061ec576..3c0f3c0a1cc8 100644 --- a/drivers/mmc/host/sdhci-pxav2.c +++ b/drivers/mmc/host/sdhci-pxav2.c @@ -51,11 +51,13 @@ #define MMC_CARD 0x1000 #define MMC_WIDTH 0x0100 -static void pxav2_set_private_registers(struct sdhci_host *host, u8 mask) +static void pxav2_reset(struct sdhci_host *host, u8 mask) { struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + sdhci_reset(host, mask); + if (mask == SDHCI_RESET_ALL) { u16 tmp = 0; @@ -88,7 +90,7 @@ static void pxav2_set_private_registers(struct sdhci_host *host, u8 mask) } } -static int pxav2_mmc_set_width(struct sdhci_host *host, int width) +static void pxav2_mmc_set_bus_width(struct sdhci_host *host, int width) { u8 ctrl; u16 tmp; @@ -107,14 +109,14 @@ static int
pxav2_mmc_set_width(struct sdhci_host *host, int width) } writew(tmp, host->ioaddr + SD_CE_ATA_2); writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); - - return 0; } static const struct sdhci_ops pxav2_sdhci_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_pltfm_clk_get_max_clock, - .platform_reset_exit = pxav2_set_private_registers, - .platform_bus_width = pxav2_mmc_set_width, + .set_bus_width = pxav2_mmc_set_bus_width, + .reset = pxav2_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; #ifdef CONFIG_OF diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index 2fd73b38c303..f4f128947561 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -112,11 +112,13 @@ static int mv_conf_mbus_windows(struct platform_device *pdev, return 0; } -static void pxav3_set_private_registers(struct sdhci_host *host, u8 mask) +static void pxav3_reset(struct sdhci_host *host, u8 mask) { struct platform_device *pdev = to_platform_device(mmc_dev(host->mmc)); struct sdhci_pxa_platdata *pdata = pdev->dev.platform_data; + sdhci_reset(host, mask); + if (mask == SDHCI_RESET_ALL) { /* * tune timing of read data/command when crc error happen @@ -184,7 +186,7 @@ static void pxav3_gen_init_74_clocks(struct sdhci_host *host, u8 power_mode) pxa->power_mode = power_mode; } -static int pxav3_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) +static void pxav3_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) { u16 ctrl_2; @@ -218,15 +220,15 @@ static int pxav3_set_uhs_signaling(struct sdhci_host *host, unsigned int uhs) dev_dbg(mmc_dev(host->mmc), "%s uhs = %d, ctrl_2 = %04X\n", __func__, uhs, ctrl_2); - - return 0; } static const struct sdhci_ops pxav3_sdhci_ops = { - .platform_reset_exit = pxav3_set_private_registers, + .set_clock = sdhci_set_clock, .set_uhs_signaling = pxav3_set_uhs_signaling, .platform_send_init_74_clocks = pxav3_gen_init_74_clocks, .get_max_clock = sdhci_pltfm_clk_get_max_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = pxav3_reset, }; static struct sdhci_pltfm_data sdhci_pxav3_pdata = { diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index d61eb5a70833..fa5954a05449 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -33,9 +33,6 @@ #define MAX_BUS_CLK (4) -/* Number of gpio's used is max data bus width + command and clock lines */ -#define NUM_GPIOS(x) (x + 2) - /** * struct sdhci_s3c - S3C SDHCI instance * @host: The SDHCI host created @@ -58,6 +55,8 @@ struct sdhci_s3c { struct clk *clk_io; struct clk *clk_bus[MAX_BUS_CLK]; unsigned long clk_rates[MAX_BUS_CLK]; + + bool no_divider; }; /** @@ -70,6 +69,7 @@ struct sdhci_s3c { */ struct sdhci_s3c_drv_data { unsigned int sdhci_quirks; + bool no_divider; }; static inline struct sdhci_s3c *to_s3c(struct sdhci_host *host) @@ -119,7 +119,7 @@ static unsigned int sdhci_s3c_consider_clock(struct sdhci_s3c *ourhost, * If controller uses a non-standard clock division, find the best clock * speed possible with selected clock source and skip the division. */ - if (ourhost->host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) { + if (ourhost->no_divider) { rate = clk_round_rate(clksrc, wanted); return wanted - rate; } @@ -161,9 +161,13 @@ static void sdhci_s3c_set_clock(struct sdhci_host *host, unsigned int clock) int src; u32 ctrl; + host->mmc->actual_clock = 0; + /* don't bother if the clock is going off. 
*/ - if (clock == 0) + if (clock == 0) { + sdhci_set_clock(host, clock); return; + } for (src = 0; src < MAX_BUS_CLK; src++) { delta = sdhci_s3c_consider_clock(ourhost, src, clock); @@ -215,6 +219,8 @@ static void sdhci_s3c_set_clock(struct sdhci_host *host, unsigned int clock) if (clock < 25 * 1000000) ctrl |= (S3C_SDHCI_CTRL3_FCSEL3 | S3C_SDHCI_CTRL3_FCSEL2); writel(ctrl, host->ioaddr + S3C_SDHCI_CONTROL3); + + sdhci_set_clock(host, clock); } /** @@ -295,10 +301,11 @@ static void sdhci_cmu_set_clock(struct sdhci_host *host, unsigned int clock) unsigned long timeout; u16 clk = 0; + host->mmc->actual_clock = 0; + /* If the clock is going off, set to 0 at clock control register */ if (clock == 0) { sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); - host->clock = clock; return; } @@ -306,8 +313,6 @@ static void sdhci_cmu_set_clock(struct sdhci_host *host, unsigned int clock) clk_set_rate(ourhost->clk_bus[ourhost->cur_clk], clock); - host->clock = clock; - clk = SDHCI_CLOCK_INT_EN; sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); @@ -329,14 +334,14 @@ static void sdhci_cmu_set_clock(struct sdhci_host *host, unsigned int clock) } /** - * sdhci_s3c_platform_bus_width - support 8bit buswidth + * sdhci_s3c_set_bus_width - support 8bit buswidth * @host: The SDHCI host being queried * @width: MMC_BUS_WIDTH_ macro for the bus width being requested * * We have 8-bit width support but is not a v3 controller. * So we add platform_bus_width() and support 8bit width. */ -static int sdhci_s3c_platform_bus_width(struct sdhci_host *host, int width) +static void sdhci_s3c_set_bus_width(struct sdhci_host *host, int width) { u8 ctrl; @@ -358,93 +363,23 @@ static int sdhci_s3c_platform_bus_width(struct sdhci_host *host, int width) } sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); - - return 0; } static struct sdhci_ops sdhci_s3c_ops = { .get_max_clock = sdhci_s3c_get_max_clk, .set_clock = sdhci_s3c_set_clock, .get_min_clock = sdhci_s3c_get_min_clock, - .platform_bus_width = sdhci_s3c_platform_bus_width, + .set_bus_width = sdhci_s3c_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; -static void sdhci_s3c_notify_change(struct platform_device *dev, int state) -{ - struct sdhci_host *host = platform_get_drvdata(dev); -#ifdef CONFIG_PM_RUNTIME - struct sdhci_s3c *sc = sdhci_priv(host); -#endif - unsigned long flags; - - if (host) { - spin_lock_irqsave(&host->lock, flags); - if (state) { - dev_dbg(&dev->dev, "card inserted.\n"); -#ifdef CONFIG_PM_RUNTIME - clk_prepare_enable(sc->clk_io); -#endif - host->flags &= ~SDHCI_DEVICE_DEAD; - host->quirks |= SDHCI_QUIRK_BROKEN_CARD_DETECTION; - } else { - dev_dbg(&dev->dev, "card removed.\n"); - host->flags |= SDHCI_DEVICE_DEAD; - host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION; -#ifdef CONFIG_PM_RUNTIME - clk_disable_unprepare(sc->clk_io); -#endif - } - tasklet_schedule(&host->card_tasklet); - spin_unlock_irqrestore(&host->lock, flags); - } -} - -static irqreturn_t sdhci_s3c_gpio_card_detect_thread(int irq, void *dev_id) -{ - struct sdhci_s3c *sc = dev_id; - int status = gpio_get_value(sc->ext_cd_gpio); - if (sc->pdata->ext_cd_gpio_invert) - status = !status; - sdhci_s3c_notify_change(sc->pdev, status); - return IRQ_HANDLED; -} - -static void sdhci_s3c_setup_card_detect_gpio(struct sdhci_s3c *sc) -{ - struct s3c_sdhci_platdata *pdata = sc->pdata; - struct device *dev = &sc->pdev->dev; - - if (devm_gpio_request(dev, pdata->ext_cd_gpio, "SDHCI EXT CD") == 0) { - sc->ext_cd_gpio = pdata->ext_cd_gpio; - sc->ext_cd_irq = 
gpio_to_irq(pdata->ext_cd_gpio); - if (sc->ext_cd_irq && - request_threaded_irq(sc->ext_cd_irq, NULL, - sdhci_s3c_gpio_card_detect_thread, - IRQF_TRIGGER_RISING | - IRQF_TRIGGER_FALLING | - IRQF_ONESHOT, - dev_name(dev), sc) == 0) { - int status = gpio_get_value(sc->ext_cd_gpio); - if (pdata->ext_cd_gpio_invert) - status = !status; - sdhci_s3c_notify_change(sc->pdev, status); - } else { - dev_warn(dev, "cannot request irq for card detect\n"); - sc->ext_cd_irq = 0; - } - } else { - dev_err(dev, "cannot request gpio for card detect\n"); - } -} - #ifdef CONFIG_OF static int sdhci_s3c_parse_dt(struct device *dev, struct sdhci_host *host, struct s3c_sdhci_platdata *pdata) { struct device_node *node = dev->of_node; - struct sdhci_s3c *ourhost = to_s3c(host); u32 max_width; - int gpio; /* if the bus-width property is not specified, assume width as 1 */ if (of_property_read_u32(node, "bus-width", &max_width)) @@ -462,18 +397,8 @@ static int sdhci_s3c_parse_dt(struct device *dev, return 0; } - gpio = of_get_named_gpio(node, "cd-gpios", 0); - if (gpio_is_valid(gpio)) { - pdata->cd_type = S3C_SDHCI_CD_GPIO; - pdata->ext_cd_gpio = gpio; - ourhost->ext_cd_gpio = -1; - if (of_get_property(node, "cd-inverted", NULL)) - pdata->ext_cd_gpio_invert = 1; + if (of_get_named_gpio(node, "cd-gpios", 0)) return 0; - } else if (gpio != -ENOENT) { - dev_err(dev, "invalid card detect gpio specified\n"); - return -EINVAL; - } /* assuming internal card detect that will be configured by pinctrl */ pdata->cd_type = S3C_SDHCI_CD_INTERNAL; @@ -606,8 +531,10 @@ static int sdhci_s3c_probe(struct platform_device *pdev) /* Setup quirks for the controller */ host->quirks |= SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC; host->quirks |= SDHCI_QUIRK_NO_HISPD_BIT; - if (drv_data) + if (drv_data) { host->quirks |= drv_data->sdhci_quirks; + sc->no_divider = drv_data->no_divider; + } #ifndef CONFIG_MMC_SDHCI_S3C_DMA @@ -656,7 +583,7 @@ static int sdhci_s3c_probe(struct platform_device *pdev) * If controller does not have internal clock divider, * we can use overriding functions instead of default. */ - if (host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) { + if (sc->no_divider) { sdhci_s3c_ops.set_clock = sdhci_cmu_set_clock; sdhci_s3c_ops.get_min_clock = sdhci_cmu_get_min_clock; sdhci_s3c_ops.get_max_clock = sdhci_cmu_get_max_clock; @@ -674,6 +601,8 @@ static int sdhci_s3c_probe(struct platform_device *pdev) pm_runtime_use_autosuspend(&pdev->dev); pm_suspend_ignore_children(&pdev->dev, 1); + mmc_of_parse(host->mmc); + ret = sdhci_add_host(host); if (ret) { dev_err(dev, "sdhci_add_host() failed\n"); @@ -682,15 +611,6 @@ static int sdhci_s3c_probe(struct platform_device *pdev) goto err_req_regs; } - /* The following two methods of card detection might call - sdhci_s3c_notify_change() immediately, so they can be called - only after sdhci_add_host(). Setup errors are ignored. 
*/ - if (pdata->cd_type == S3C_SDHCI_CD_EXTERNAL && pdata->ext_cd_init) - pdata->ext_cd_init(&sdhci_s3c_notify_change); - if (pdata->cd_type == S3C_SDHCI_CD_GPIO && - gpio_is_valid(pdata->ext_cd_gpio)) - sdhci_s3c_setup_card_detect_gpio(sc); - #ifdef CONFIG_PM_RUNTIME if (pdata->cd_type != S3C_SDHCI_CD_INTERNAL) clk_disable_unprepare(sc->clk_io); @@ -711,16 +631,12 @@ static int sdhci_s3c_remove(struct platform_device *pdev) { struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_s3c *sc = sdhci_priv(host); - struct s3c_sdhci_platdata *pdata = sc->pdata; - - if (pdata->cd_type == S3C_SDHCI_CD_EXTERNAL && pdata->ext_cd_cleanup) - pdata->ext_cd_cleanup(&sdhci_s3c_notify_change); if (sc->ext_cd_irq) free_irq(sc->ext_cd_irq, sc); #ifdef CONFIG_PM_RUNTIME - if (pdata->cd_type != S3C_SDHCI_CD_INTERNAL) + if (sc->pdata->cd_type != S3C_SDHCI_CD_INTERNAL) clk_prepare_enable(sc->clk_io); #endif sdhci_remove_host(host, 1); @@ -797,7 +713,7 @@ static const struct dev_pm_ops sdhci_s3c_pmops = { #if defined(CONFIG_CPU_EXYNOS4210) || defined(CONFIG_SOC_EXYNOS4212) static struct sdhci_s3c_drv_data exynos4_sdhci_drv_data = { - .sdhci_quirks = SDHCI_QUIRK_NONSTANDARD_CLOCK, + .no_divider = true, }; #define EXYNOS4_SDHCI_DRV_DATA ((kernel_ulong_t)&exynos4_sdhci_drv_data) #else diff --git a/drivers/mmc/host/sdhci-sirf.c b/drivers/mmc/host/sdhci-sirf.c index 696122c1b468..17004531d089 100644 --- a/drivers/mmc/host/sdhci-sirf.c +++ b/drivers/mmc/host/sdhci-sirf.c @@ -28,7 +28,11 @@ static unsigned int sdhci_sirf_get_max_clk(struct sdhci_host *host) } static struct sdhci_ops sdhci_sirf_ops = { + .set_clock = sdhci_set_clock, .get_max_clock = sdhci_sirf_get_max_clk, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; static struct sdhci_pltfm_data sdhci_sirf_pdata = { diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index 0316dec3f006..9d535c7336ef 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -38,7 +38,10 @@ struct spear_sdhci { /* sdhci ops */ static const struct sdhci_ops sdhci_pltfm_ops = { - /* Nothing to do for now. */ + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, }; #ifdef CONFIG_OF diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index a835898a68dd..d93a063a36f3 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -32,11 +32,17 @@ /* Tegra SDHOST controller vendor register definitions */ #define SDHCI_TEGRA_VENDOR_MISC_CTRL 0x120 +#define SDHCI_MISC_CTRL_ENABLE_SDR104 0x8 +#define SDHCI_MISC_CTRL_ENABLE_SDR50 0x10 #define SDHCI_MISC_CTRL_ENABLE_SDHCI_SPEC_300 0x20 +#define SDHCI_MISC_CTRL_ENABLE_DDR50 0x200 #define NVQUIRK_FORCE_SDHCI_SPEC_200 BIT(0) #define NVQUIRK_ENABLE_BLOCK_GAP_DET BIT(1) #define NVQUIRK_ENABLE_SDHCI_SPEC_300 BIT(2) +#define NVQUIRK_DISABLE_SDR50 BIT(3) +#define NVQUIRK_DISABLE_SDR104 BIT(4) +#define NVQUIRK_DISABLE_DDR50 BIT(5) struct sdhci_tegra_soc_data { const struct sdhci_pltfm_data *pdata; @@ -48,19 +54,6 @@ struct sdhci_tegra { int power_gpio; }; -static u32 tegra_sdhci_readl(struct sdhci_host *host, int reg) -{ - u32 val; - - if (unlikely(reg == SDHCI_PRESENT_STATE)) { - /* Use wp_gpio here instead? 
*/ - val = readl(host->ioaddr + reg); - return val | SDHCI_WRITE_PROTECT; - } - - return readl(host->ioaddr + reg); -} - static u16 tegra_sdhci_readw(struct sdhci_host *host, int reg) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -108,26 +101,33 @@ static unsigned int tegra_sdhci_get_ro(struct sdhci_host *host) return mmc_gpio_get_ro(host->mmc); } -static void tegra_sdhci_reset_exit(struct sdhci_host *host, u8 mask) +static void tegra_sdhci_reset(struct sdhci_host *host, u8 mask) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_tegra *tegra_host = pltfm_host->priv; const struct sdhci_tegra_soc_data *soc_data = tegra_host->soc_data; + u32 misc_ctrl; + + sdhci_reset(host, mask); if (!(mask & SDHCI_RESET_ALL)) return; + misc_ctrl = sdhci_readw(host, SDHCI_TEGRA_VENDOR_MISC_CTRL); /* Erratum: Enable SDHCI spec v3.00 support */ - if (soc_data->nvquirks & NVQUIRK_ENABLE_SDHCI_SPEC_300) { - u32 misc_ctrl; - - misc_ctrl = sdhci_readb(host, SDHCI_TEGRA_VENDOR_MISC_CTRL); + if (soc_data->nvquirks & NVQUIRK_ENABLE_SDHCI_SPEC_300) misc_ctrl |= SDHCI_MISC_CTRL_ENABLE_SDHCI_SPEC_300; - sdhci_writeb(host, misc_ctrl, SDHCI_TEGRA_VENDOR_MISC_CTRL); - } + /* Don't advertise UHS modes which aren't supported yet */ + if (soc_data->nvquirks & NVQUIRK_DISABLE_SDR50) + misc_ctrl &= ~SDHCI_MISC_CTRL_ENABLE_SDR50; + if (soc_data->nvquirks & NVQUIRK_DISABLE_DDR50) + misc_ctrl &= ~SDHCI_MISC_CTRL_ENABLE_DDR50; + if (soc_data->nvquirks & NVQUIRK_DISABLE_SDR104) + misc_ctrl &= ~SDHCI_MISC_CTRL_ENABLE_SDR104; + sdhci_writew(host, misc_ctrl, SDHCI_TEGRA_VENDOR_MISC_CTRL); } -static int tegra_sdhci_buswidth(struct sdhci_host *host, int bus_width) +static void tegra_sdhci_set_bus_width(struct sdhci_host *host, int bus_width) { u32 ctrl; @@ -144,23 +144,25 @@ static int tegra_sdhci_buswidth(struct sdhci_host *host, int bus_width) ctrl &= ~SDHCI_CTRL_4BITBUS; } sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); - return 0; } static const struct sdhci_ops tegra_sdhci_ops = { .get_ro = tegra_sdhci_get_ro, - .read_l = tegra_sdhci_readl, .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, - .platform_bus_width = tegra_sdhci_buswidth, - .platform_reset_exit = tegra_sdhci_reset_exit, + .set_clock = sdhci_set_clock, + .set_bus_width = tegra_sdhci_set_bus_width, + .reset = tegra_sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .get_max_clock = sdhci_pltfm_clk_get_max_clock, }; static const struct sdhci_pltfm_data sdhci_tegra20_pdata = { .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, + SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | + SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .ops = &tegra_sdhci_ops, }; @@ -175,13 +177,16 @@ static const struct sdhci_pltfm_data sdhci_tegra30_pdata = { SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, + SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | + SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .ops = &tegra_sdhci_ops, }; static struct sdhci_tegra_soc_data soc_data_tegra30 = { .pdata = &sdhci_tegra30_pdata, - .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300, + .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | + NVQUIRK_DISABLE_SDR50 | + NVQUIRK_DISABLE_SDR104, }; static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { @@ -189,12 +194,16 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - 
SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC, + SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | + SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .ops = &tegra_sdhci_ops, }; static struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, + .nvquirks = NVQUIRK_DISABLE_SDR50 | + NVQUIRK_DISABLE_DDR50 | + NVQUIRK_DISABLE_SDR104, }; static const struct of_device_id sdhci_tegra_dt_match[] = { diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 9a79fc4b60ca..47055f3f01b8 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -44,6 +44,8 @@ #define MAX_TUNING_LOOP 40 +#define ADMA_SIZE ((128 * 2 + 1) * 4) + static unsigned int debug_quirks = 0; static unsigned int debug_quirks2; @@ -131,43 +133,26 @@ static void sdhci_dumpregs(struct sdhci_host *host) * * \*****************************************************************************/ -static void sdhci_clear_set_irqs(struct sdhci_host *host, u32 clear, u32 set) -{ - u32 ier; - - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - ier &= ~clear; - ier |= set; - sdhci_writel(host, ier, SDHCI_INT_ENABLE); - sdhci_writel(host, ier, SDHCI_SIGNAL_ENABLE); -} - -static void sdhci_unmask_irqs(struct sdhci_host *host, u32 irqs) -{ - sdhci_clear_set_irqs(host, 0, irqs); -} - -static void sdhci_mask_irqs(struct sdhci_host *host, u32 irqs) -{ - sdhci_clear_set_irqs(host, irqs, 0); -} - static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) { - u32 present, irqs; + u32 present; if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) || (host->mmc->caps & MMC_CAP_NONREMOVABLE)) return; - present = sdhci_readl(host, SDHCI_PRESENT_STATE) & - SDHCI_CARD_PRESENT; - irqs = present ? SDHCI_INT_CARD_REMOVE : SDHCI_INT_CARD_INSERT; + if (enable) { + present = sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT; - if (enable) - sdhci_unmask_irqs(host, irqs); - else - sdhci_mask_irqs(host, irqs); + host->ier |= present ? 
SDHCI_INT_CARD_REMOVE : + SDHCI_INT_CARD_INSERT; + } else { + host->ier &= ~(SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT); + } + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } static void sdhci_enable_card_detection(struct sdhci_host *host) @@ -180,22 +165,9 @@ static void sdhci_disable_card_detection(struct sdhci_host *host) sdhci_set_card_detection(host, false); } -static void sdhci_reset(struct sdhci_host *host, u8 mask) +void sdhci_reset(struct sdhci_host *host, u8 mask) { unsigned long timeout; - u32 uninitialized_var(ier); - - if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) { - if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & - SDHCI_CARD_PRESENT)) - return; - } - - if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - - if (host->ops->platform_reset_enter) - host->ops->platform_reset_enter(host, mask); sdhci_writeb(host, mask, SDHCI_SOFTWARE_RESET); @@ -220,16 +192,27 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) timeout--; mdelay(1); } +} +EXPORT_SYMBOL_GPL(sdhci_reset); + +static void sdhci_do_reset(struct sdhci_host *host, u8 mask) +{ + if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) { + if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT)) + return; + } - if (host->ops->platform_reset_exit) - host->ops->platform_reset_exit(host, mask); + host->ops->reset(host, mask); - if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) - sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, ier); + if (mask & SDHCI_RESET_ALL) { + if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { + if (host->ops->enable_dma) + host->ops->enable_dma(host); + } - if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { - if ((host->ops->enable_dma) && (mask & SDHCI_RESET_ALL)) - host->ops->enable_dma(host); + /* Resetting the controller clears many registers, including the preset value enable */ + host->preset_enabled = false; } } @@ -238,15 +221,18 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios); static void sdhci_init(struct sdhci_host *host, int soft) { if (soft) - sdhci_reset(host, SDHCI_RESET_CMD|SDHCI_RESET_DATA); + sdhci_do_reset(host, SDHCI_RESET_CMD|SDHCI_RESET_DATA); else - sdhci_reset(host, SDHCI_RESET_ALL); + sdhci_do_reset(host, SDHCI_RESET_ALL); - sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, - SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | - SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_INDEX | - SDHCI_INT_END_BIT | SDHCI_INT_CRC | SDHCI_INT_TIMEOUT | - SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE); + host->ier = SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | + SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | + SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC | + SDHCI_INT_TIMEOUT | SDHCI_INT_DATA_END | + SDHCI_INT_RESPONSE; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); if (soft) { /* force clock reconfiguration */ @@ -502,11 +488,6 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, else direction = DMA_TO_DEVICE; - /* - * The ADMA descriptor table is mapped further down as we - * need to fill it with data first. - */ - host->align_addr = dma_map_single(mmc_dev(host->mmc), host->align_buffer, 128 * 4, direction); if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr)) @@ -567,7 +548,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, * If this triggers then we have a calculation bug * somewhere. 
:/ */ - WARN_ON((desc - host->adma_desc) > (128 * 2 + 1) * 4); + WARN_ON((desc - host->adma_desc) > ADMA_SIZE); } if (host->quirks & SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC) { @@ -595,17 +576,8 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, host->align_addr, 128 * 4, direction); } - host->adma_addr = dma_map_single(mmc_dev(host->mmc), - host->adma_desc, (128 * 2 + 1) * 4, DMA_TO_DEVICE); - if (dma_mapping_error(mmc_dev(host->mmc), host->adma_addr)) - goto unmap_entries; - BUG_ON(host->adma_addr & 0x3); - return 0; -unmap_entries: - dma_unmap_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, direction); unmap_align: dma_unmap_single(mmc_dev(host->mmc), host->align_addr, 128 * 4, direction); @@ -623,19 +595,25 @@ static void sdhci_adma_table_post(struct sdhci_host *host, u8 *align; char *buffer; unsigned long flags; + bool has_unaligned; if (data->flags & MMC_DATA_READ) direction = DMA_FROM_DEVICE; else direction = DMA_TO_DEVICE; - dma_unmap_single(mmc_dev(host->mmc), host->adma_addr, - (128 * 2 + 1) * 4, DMA_TO_DEVICE); - dma_unmap_single(mmc_dev(host->mmc), host->align_addr, 128 * 4, direction); - if (data->flags & MMC_DATA_READ) { + /* Do a quick scan of the SG list for any unaligned mappings */ + has_unaligned = false; + for_each_sg(data->sg, sg, host->sg_count, i) + if (sg_dma_address(sg) & 3) { + has_unaligned = true; + break; + } + + if (has_unaligned && data->flags & MMC_DATA_READ) { dma_sync_sg_for_cpu(mmc_dev(host->mmc), data->sg, data->sg_len, direction); @@ -721,9 +699,12 @@ static void sdhci_set_transfer_irqs(struct sdhci_host *host) u32 dma_irqs = SDHCI_INT_DMA_END | SDHCI_INT_ADMA_ERROR; if (host->flags & SDHCI_REQ_USE_DMA) - sdhci_clear_set_irqs(host, pio_irqs, dma_irqs); + host->ier = (host->ier & ~pio_irqs) | dma_irqs; else - sdhci_clear_set_irqs(host, dma_irqs, pio_irqs); + host->ier = (host->ier & ~dma_irqs) | pio_irqs; + + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) @@ -976,8 +957,8 @@ static void sdhci_finish_data(struct sdhci_host *host) * upon error conditions. 
*/ if (data->error) { - sdhci_reset(host, SDHCI_RESET_CMD); - sdhci_reset(host, SDHCI_RESET_DATA); + sdhci_do_reset(host, SDHCI_RESET_CMD); + sdhci_do_reset(host, SDHCI_RESET_DATA); } sdhci_send_command(host, data->stop); @@ -1107,24 +1088,23 @@ static void sdhci_finish_command(struct sdhci_host *host) static u16 sdhci_get_preset_value(struct sdhci_host *host) { - u16 ctrl, preset = 0; - - ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + u16 preset = 0; - switch (ctrl & SDHCI_CTRL_UHS_MASK) { - case SDHCI_CTRL_UHS_SDR12: + switch (host->timing) { + case MMC_TIMING_UHS_SDR12: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR12); break; - case SDHCI_CTRL_UHS_SDR25: + case MMC_TIMING_UHS_SDR25: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR25); break; - case SDHCI_CTRL_UHS_SDR50: + case MMC_TIMING_UHS_SDR50: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR50); break; - case SDHCI_CTRL_UHS_SDR104: + case MMC_TIMING_UHS_SDR104: + case MMC_TIMING_MMC_HS200: preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR104); break; - case SDHCI_CTRL_UHS_DDR50: + case MMC_TIMING_UHS_DDR50: preset = sdhci_readw(host, SDHCI_PRESET_FOR_DDR50); break; default: @@ -1136,32 +1116,22 @@ static u16 sdhci_get_preset_value(struct sdhci_host *host) return preset; } -static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) +void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) { int div = 0; /* Initialized for compiler warning */ int real_div = div, clk_mul = 1; u16 clk = 0; unsigned long timeout; - if (clock && clock == host->clock) - return; - host->mmc->actual_clock = 0; - if (host->ops->set_clock) { - host->ops->set_clock(host, clock); - if (host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) - return; - } - sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); if (clock == 0) - goto out; + return; if (host->version >= SDHCI_SPEC_300) { - if (sdhci_readw(host, SDHCI_HOST_CONTROL2) & - SDHCI_CTRL_PRESET_VAL_ENABLE) { + if (host->preset_enabled) { u16 pre_val; clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); @@ -1247,26 +1217,16 @@ clock_set: clk |= SDHCI_CLOCK_CARD_EN; sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); - -out: - host->clock = clock; } +EXPORT_SYMBOL_GPL(sdhci_set_clock); -static inline void sdhci_update_clock(struct sdhci_host *host) -{ - unsigned int clock; - - clock = host->clock; - host->clock = 0; - sdhci_set_clock(host, clock); -} - -static int sdhci_set_power(struct sdhci_host *host, unsigned short power) +static void sdhci_set_power(struct sdhci_host *host, unsigned char mode, + unsigned short vdd) { u8 pwr = 0; - if (power != (unsigned short)-1) { - switch (1 << power) { + if (mode != MMC_POWER_OFF) { + switch (1 << vdd) { case MMC_VDD_165_195: pwr = SDHCI_POWER_180; break; @@ -1284,7 +1244,7 @@ static int sdhci_set_power(struct sdhci_host *host, unsigned short power) } if (host->pwr == pwr) - return -1; + return; host->pwr = pwr; @@ -1292,38 +1252,43 @@ static int sdhci_set_power(struct sdhci_host *host, unsigned short power) sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); if (host->quirks2 & SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON) sdhci_runtime_pm_bus_off(host); - return 0; - } - - /* - * Spec says that we should clear the power reg before setting - * a new value. Some controllers don't seem to like this though. - */ - if (!(host->quirks & SDHCI_QUIRK_SINGLE_POWER_WRITE)) - sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + vdd = 0; + } else { + /* + * Spec says that we should clear the power reg before setting + * a new value. Some controllers don't seem to like this though. 
+ */ + if (!(host->quirks & SDHCI_QUIRK_SINGLE_POWER_WRITE)) + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); - /* - * At least the Marvell CaFe chip gets confused if we set the voltage - * and set turn on power at the same time, so set the voltage first. - */ - if (host->quirks & SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER) - sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); + /* + * At least the Marvell CaFe chip gets confused if we set the + * voltage and turn on power at the same time, so set the + * voltage first. + */ + if (host->quirks & SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER) + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); - pwr |= SDHCI_POWER_ON; + pwr |= SDHCI_POWER_ON; - sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); - if (host->quirks2 & SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON) - sdhci_runtime_pm_bus_on(host); + if (host->quirks2 & SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON) + sdhci_runtime_pm_bus_on(host); - /* - * Some controllers need an extra 10ms delay of 10ms before they - * can apply clock after applying power - */ - if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER) - mdelay(10); + /* + * Some controllers need an extra 10ms delay before + * they can apply clock after applying power + */ + if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER) + mdelay(10); + } - return power; + if (host->vmmc) { + spin_unlock_irq(&host->lock); + mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd); + spin_lock_irq(&host->lock); + } } /*****************************************************************************\ @@ -1427,10 +1392,53 @@ static void sdhci_request(struct mmc_host *mmc, struct mmc_request *mrq) spin_unlock_irqrestore(&host->lock, flags); } +void sdhci_set_bus_width(struct sdhci_host *host, int width) +{ + u8 ctrl; + + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); + if (width == MMC_BUS_WIDTH_8) { + ctrl &= ~SDHCI_CTRL_4BITBUS; + if (host->version >= SDHCI_SPEC_300) + ctrl |= SDHCI_CTRL_8BITBUS; + } else { + if (host->version >= SDHCI_SPEC_300) + ctrl &= ~SDHCI_CTRL_8BITBUS; + if (width == MMC_BUS_WIDTH_4) + ctrl |= SDHCI_CTRL_4BITBUS; + else + ctrl &= ~SDHCI_CTRL_4BITBUS; + } + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); +} +EXPORT_SYMBOL_GPL(sdhci_set_bus_width); + +void sdhci_set_uhs_signaling(struct sdhci_host *host, unsigned timing) +{ + u16 ctrl_2; + + ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + /* Select Bus Speed Mode for host */ + ctrl_2 &= ~SDHCI_CTRL_UHS_MASK; + if ((timing == MMC_TIMING_MMC_HS200) || + (timing == MMC_TIMING_UHS_SDR104)) + ctrl_2 |= SDHCI_CTRL_UHS_SDR104; + else if (timing == MMC_TIMING_UHS_SDR12) + ctrl_2 |= SDHCI_CTRL_UHS_SDR12; + else if (timing == MMC_TIMING_UHS_SDR25) + ctrl_2 |= SDHCI_CTRL_UHS_SDR25; + else if (timing == MMC_TIMING_UHS_SDR50) + ctrl_2 |= SDHCI_CTRL_UHS_SDR50; + else if ((timing == MMC_TIMING_UHS_DDR50) || + (timing == MMC_TIMING_MMC_DDR52)) + ctrl_2 |= SDHCI_CTRL_UHS_DDR50; + sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); +} +EXPORT_SYMBOL_GPL(sdhci_set_uhs_signaling); + static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) { unsigned long flags; - int vdd_bit = -1; u8 ctrl; spin_lock_irqsave(&host->lock, flags); @@ -1456,45 +1464,17 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) !(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN)) sdhci_enable_preset_value(host, false); - sdhci_set_clock(host, ios->clock); - - if (ios->power_mode == MMC_POWER_OFF) - vdd_bit = sdhci_set_power(host, -1); - else - vdd_bit = sdhci_set_power(host, ios->vdd); - - if 
(host->vmmc && vdd_bit != -1) { - spin_unlock_irqrestore(&host->lock, flags); - mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd_bit); - spin_lock_irqsave(&host->lock, flags); + if (!ios->clock || ios->clock != host->clock) { + host->ops->set_clock(host, ios->clock); + host->clock = ios->clock; } + sdhci_set_power(host, ios->power_mode, ios->vdd); + if (host->ops->platform_send_init_74_clocks) host->ops->platform_send_init_74_clocks(host, ios->power_mode); - /* - * If your platform has 8-bit width support but is not a v3 controller, - * or if it requires special setup code, you should implement that in - * platform_bus_width(). - */ - if (host->ops->platform_bus_width) { - host->ops->platform_bus_width(host, ios->bus_width); - } else { - ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); - if (ios->bus_width == MMC_BUS_WIDTH_8) { - ctrl &= ~SDHCI_CTRL_4BITBUS; - if (host->version >= SDHCI_SPEC_300) - ctrl |= SDHCI_CTRL_8BITBUS; - } else { - if (host->version >= SDHCI_SPEC_300) - ctrl &= ~SDHCI_CTRL_8BITBUS; - if (ios->bus_width == MMC_BUS_WIDTH_4) - ctrl |= SDHCI_CTRL_4BITBUS; - else - ctrl &= ~SDHCI_CTRL_4BITBUS; - } - sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); - } + host->ops->set_bus_width(host, ios->bus_width); ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); @@ -1510,19 +1490,20 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) /* In case of UHS-I modes, set High Speed Enable */ if ((ios->timing == MMC_TIMING_MMC_HS200) || + (ios->timing == MMC_TIMING_MMC_DDR52) || (ios->timing == MMC_TIMING_UHS_SDR50) || (ios->timing == MMC_TIMING_UHS_SDR104) || (ios->timing == MMC_TIMING_UHS_DDR50) || (ios->timing == MMC_TIMING_UHS_SDR25)) ctrl |= SDHCI_CTRL_HISPD; - ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); - if (!(ctrl_2 & SDHCI_CTRL_PRESET_VAL_ENABLE)) { + if (!host->preset_enabled) { sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); /* * We only need to set Driver Strength if the * preset value enable is not set. 
*/ + ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); ctrl_2 &= ~SDHCI_CTRL_DRV_TYPE_MASK; if (ios->drv_type == MMC_SET_DRIVER_TYPE_A) ctrl_2 |= SDHCI_CTRL_DRV_TYPE_A; @@ -1546,7 +1527,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); /* Re-enable SD Clock */ - sdhci_update_clock(host); + host->ops->set_clock(host, host->clock); } @@ -1555,25 +1536,8 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) clk &= ~SDHCI_CLOCK_CARD_EN; sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); - if (host->ops->set_uhs_signaling) - host->ops->set_uhs_signaling(host, ios->timing); - else { - ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); - /* Select Bus Speed Mode for host */ - ctrl_2 &= ~SDHCI_CTRL_UHS_MASK; - if ((ios->timing == MMC_TIMING_MMC_HS200) || - (ios->timing == MMC_TIMING_UHS_SDR104)) - ctrl_2 |= SDHCI_CTRL_UHS_SDR104; - else if (ios->timing == MMC_TIMING_UHS_SDR12) - ctrl_2 |= SDHCI_CTRL_UHS_SDR12; - else if (ios->timing == MMC_TIMING_UHS_SDR25) - ctrl_2 |= SDHCI_CTRL_UHS_SDR25; - else if (ios->timing == MMC_TIMING_UHS_SDR50) - ctrl_2 |= SDHCI_CTRL_UHS_SDR50; - else if (ios->timing == MMC_TIMING_UHS_DDR50) - ctrl_2 |= SDHCI_CTRL_UHS_DDR50; - sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); - } + host->ops->set_uhs_signaling(host, ios->timing); + host->timing = ios->timing; if (!(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN) && ((ios->timing == MMC_TIMING_UHS_SDR12) || @@ -1590,7 +1554,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) } /* Re-enable SD Clock */ - sdhci_update_clock(host); + host->ops->set_clock(host, host->clock); } else sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); @@ -1600,7 +1564,7 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) * it on each ios seems to solve the problem. 
*/ if(host->quirks & SDHCI_QUIRK_RESET_CMD_DATA_ON_IOS) - sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA); + sdhci_do_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA); mmiowb(); spin_unlock_irqrestore(&host->lock, flags); @@ -1709,24 +1673,16 @@ static int sdhci_get_ro(struct mmc_host *mmc) static void sdhci_enable_sdio_irq_nolock(struct sdhci_host *host, int enable) { - if (host->flags & SDHCI_DEVICE_DEAD) - goto out; - - if (enable) - host->flags |= SDHCI_SDIO_IRQ_ENABLED; - else - host->flags &= ~SDHCI_SDIO_IRQ_ENABLED; - - /* SDIO IRQ will be enabled as appropriate in runtime resume */ - if (host->runtime_suspended) - goto out; + if (!(host->flags & SDHCI_DEVICE_DEAD)) { + if (enable) + host->ier |= SDHCI_INT_CARD_INT; + else + host->ier &= ~SDHCI_INT_CARD_INT; - if (enable) - sdhci_unmask_irqs(host, SDHCI_INT_CARD_INT); - else - sdhci_mask_irqs(host, SDHCI_INT_CARD_INT); -out: - mmiowb(); + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); + mmiowb(); + } } static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) @@ -1734,9 +1690,18 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) struct sdhci_host *host = mmc_priv(mmc); unsigned long flags; + sdhci_runtime_pm_get(host); + spin_lock_irqsave(&host->lock, flags); + if (enable) + host->flags |= SDHCI_SDIO_IRQ_ENABLED; + else + host->flags &= ~SDHCI_SDIO_IRQ_ENABLED; + sdhci_enable_sdio_irq_nolock(host, enable); spin_unlock_irqrestore(&host->lock, flags); + + sdhci_runtime_pm_put(host); } static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host, @@ -1855,22 +1820,15 @@ static int sdhci_card_busy(struct mmc_host *mmc) static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) { - struct sdhci_host *host; + struct sdhci_host *host = mmc_priv(mmc); u16 ctrl; - u32 ier; int tuning_loop_counter = MAX_TUNING_LOOP; - unsigned long timeout; int err = 0; - bool requires_tuning_nonuhs = false; unsigned long flags; - host = mmc_priv(mmc); - sdhci_runtime_pm_get(host); spin_lock_irqsave(&host->lock, flags); - ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); - /* * The Host Controller needs tuning only in case of SDR104 mode * and for SDR50 mode when Use Tuning for SDR50 is set in the @@ -1878,15 +1836,18 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) * If the Host Controller supports the HS200 mode then the * tuning function has to be executed. */ - if (((ctrl & SDHCI_CTRL_UHS_MASK) == SDHCI_CTRL_UHS_SDR50) && - (host->flags & SDHCI_SDR50_NEEDS_TUNING || - host->flags & SDHCI_SDR104_NEEDS_TUNING)) - requires_tuning_nonuhs = true; - - if (((ctrl & SDHCI_CTRL_UHS_MASK) == SDHCI_CTRL_UHS_SDR104) || - requires_tuning_nonuhs) - ctrl |= SDHCI_CTRL_EXEC_TUNING; - else { + switch (host->timing) { + case MMC_TIMING_MMC_HS200: + case MMC_TIMING_UHS_SDR104: + break; + + case MMC_TIMING_UHS_SDR50: + if (host->flags & SDHCI_SDR50_NEEDS_TUNING || + host->flags & SDHCI_SDR104_NEEDS_TUNING) + break; + /* FALLTHROUGH */ + + default: spin_unlock_irqrestore(&host->lock, flags); sdhci_runtime_pm_put(host); return 0; @@ -1899,6 +1860,8 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) return err; } + ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl |= SDHCI_CTRL_EXEC_TUNING; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); /* @@ -1911,21 +1874,17 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) * to make sure we don't hit a controller bug, we _only_ * enable Buffer Read Ready interrupt here. 
*/ - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - sdhci_clear_set_irqs(host, ier, SDHCI_INT_DATA_AVAIL); + sdhci_writel(host, SDHCI_INT_DATA_AVAIL, SDHCI_INT_ENABLE); + sdhci_writel(host, SDHCI_INT_DATA_AVAIL, SDHCI_SIGNAL_ENABLE); /* * Issue CMD19 repeatedly till Execute Tuning is set to 0 or the number * of loops reaches 40 times or a timeout of 150ms occurs. */ - timeout = 150; do { struct mmc_command cmd = {0}; struct mmc_request mrq = {NULL}; - if (!tuning_loop_counter && !timeout) - break; - cmd.opcode = opcode; cmd.arg = 0; cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC; @@ -1933,6 +1892,9 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) cmd.data = NULL; cmd.error = 0; + if (tuning_loop_counter-- == 0) + break; + mrq.cmd = &cmd; host->mrq = &mrq; @@ -1990,26 +1952,25 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) host->tuning_done = 0; ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); - tuning_loop_counter--; - timeout--; - mdelay(1); + + /* eMMC spec does not require a delay between tuning cycles */ + if (opcode == MMC_SEND_TUNING_BLOCK) + mdelay(1); } while (ctrl & SDHCI_CTRL_EXEC_TUNING); /* * The Host Driver has exhausted the maximum number of loops allowed, * so use fixed sampling frequency. */ - if (!tuning_loop_counter || !timeout) { + if (tuning_loop_counter < 0) { ctrl &= ~SDHCI_CTRL_TUNED_CLK; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); + } + if (!(ctrl & SDHCI_CTRL_TUNED_CLK)) { + pr_info(DRIVER_NAME ": Tuning procedure" + " failed, falling back to fixed sampling" + " clock\n"); err = -EIO; - } else { - if (!(ctrl & SDHCI_CTRL_TUNED_CLK)) { - pr_info(DRIVER_NAME ": Tuning procedure" - " failed, falling back to fixed sampling" - " clock\n"); - err = -EIO; - } } out: @@ -2044,7 +2005,8 @@ out: if (err && (host->flags & SDHCI_USING_RETUNING_TIMER)) err = 0; - sdhci_clear_set_irqs(host, SDHCI_INT_DATA_AVAIL, ier); + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); spin_unlock_irqrestore(&host->lock, flags); sdhci_runtime_pm_put(host); @@ -2054,26 +2016,30 @@ out: static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable) { - u16 ctrl; - /* Host Controller v3.00 defines preset value registers */ if (host->version < SDHCI_SPEC_300) return; - ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); - /* * We only enable or disable Preset Value if they are not already * enabled or disabled respectively. Otherwise, we bail out. 
*/ - if (enable && !(ctrl & SDHCI_CTRL_PRESET_VAL_ENABLE)) { - ctrl |= SDHCI_CTRL_PRESET_VAL_ENABLE; - sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); - host->flags |= SDHCI_PV_ENABLED; - } else if (!enable && (ctrl & SDHCI_CTRL_PRESET_VAL_ENABLE)) { - ctrl &= ~SDHCI_CTRL_PRESET_VAL_ENABLE; + if (host->preset_enabled != enable) { + u16 ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + + if (enable) + ctrl |= SDHCI_CTRL_PRESET_VAL_ENABLE; + else + ctrl &= ~SDHCI_CTRL_PRESET_VAL_ENABLE; + sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); - host->flags &= ~SDHCI_PV_ENABLED; + + if (enable) + host->flags |= SDHCI_PV_ENABLED; + else + host->flags &= ~SDHCI_PV_ENABLED; + + host->preset_enabled = enable; } } @@ -2095,8 +2061,8 @@ static void sdhci_card_event(struct mmc_host *mmc) pr_err("%s: Resetting controller.\n", mmc_hostname(host->mmc)); - sdhci_reset(host, SDHCI_RESET_CMD); - sdhci_reset(host, SDHCI_RESET_DATA); + sdhci_do_reset(host, SDHCI_RESET_CMD); + sdhci_do_reset(host, SDHCI_RESET_DATA); host->mrq->cmd->error = -ENOMEDIUM; tasklet_schedule(&host->finish_tasklet); @@ -2124,15 +2090,6 @@ static const struct mmc_host_ops sdhci_ops = { * * \*****************************************************************************/ -static void sdhci_tasklet_card(unsigned long param) -{ - struct sdhci_host *host = (struct sdhci_host*)param; - - sdhci_card_event(host->mmc); - - mmc_detect_change(host->mmc, msecs_to_jiffies(200)); -} - static void sdhci_tasklet_finish(unsigned long param) { struct sdhci_host *host; @@ -2169,12 +2126,12 @@ static void sdhci_tasklet_finish(unsigned long param) /* Some controllers need this kick or reset won't work here */ if (host->quirks & SDHCI_QUIRK_CLOCK_BEFORE_RESET) /* This is to force an update */ - sdhci_update_clock(host); + host->ops->set_clock(host, host->clock); /* Spec says we should do both at the same time, but Ricoh controllers do not like that. */ - sdhci_reset(host, SDHCI_RESET_CMD); - sdhci_reset(host, SDHCI_RESET_DATA); + sdhci_do_reset(host, SDHCI_RESET_CMD); + sdhci_do_reset(host, SDHCI_RESET_DATA); } host->mrq = NULL; @@ -2424,101 +2381,94 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) static irqreturn_t sdhci_irq(int irq, void *dev_id) { - irqreturn_t result; + irqreturn_t result = IRQ_NONE; struct sdhci_host *host = dev_id; - u32 intmask, unexpected = 0; - int cardint = 0, max_loops = 16; + u32 intmask, mask, unexpected = 0; + int max_loops = 16; spin_lock(&host->lock); - if (host->runtime_suspended) { + if (host->runtime_suspended && !sdhci_sdio_irq_enabled(host)) { spin_unlock(&host->lock); return IRQ_NONE; } intmask = sdhci_readl(host, SDHCI_INT_STATUS); - if (!intmask || intmask == 0xffffffff) { result = IRQ_NONE; goto out; } -again: - DBG("*** %s got interrupt: 0x%08x\n", - mmc_hostname(host->mmc), intmask); - - if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { - u32 present = sdhci_readl(host, SDHCI_PRESENT_STATE) & - SDHCI_CARD_PRESENT; - - /* - * There is a observation on i.mx esdhc. INSERT bit will be - * immediately set again when it gets cleared, if a card is - * inserted. We have to mask the irq to prevent interrupt - * storm which will freeze the system. And the REMOVE gets - * the same situation. - * - * More testing are needed here to ensure it works for other - * platforms though. - */ - sdhci_mask_irqs(host, present ? SDHCI_INT_CARD_INSERT : - SDHCI_INT_CARD_REMOVE); - sdhci_unmask_irqs(host, present ? 
SDHCI_INT_CARD_REMOVE : - SDHCI_INT_CARD_INSERT); - - sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | - SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); - intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE); - tasklet_schedule(&host->card_tasklet); - } - - if (intmask & SDHCI_INT_CMD_MASK) { - sdhci_writel(host, intmask & SDHCI_INT_CMD_MASK, - SDHCI_INT_STATUS); - sdhci_cmd_irq(host, intmask & SDHCI_INT_CMD_MASK); - } + do { + /* Clear selected interrupts. */ + mask = intmask & (SDHCI_INT_CMD_MASK | SDHCI_INT_DATA_MASK | + SDHCI_INT_BUS_POWER); + sdhci_writel(host, mask, SDHCI_INT_STATUS); - if (intmask & SDHCI_INT_DATA_MASK) { - sdhci_writel(host, intmask & SDHCI_INT_DATA_MASK, - SDHCI_INT_STATUS); - sdhci_data_irq(host, intmask & SDHCI_INT_DATA_MASK); - } + DBG("*** %s got interrupt: 0x%08x\n", + mmc_hostname(host->mmc), intmask); - intmask &= ~(SDHCI_INT_CMD_MASK | SDHCI_INT_DATA_MASK); + if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { + u32 present = sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT; - intmask &= ~SDHCI_INT_ERROR; + /* + * There is an observation on i.mx esdhc. INSERT + * bit will be immediately set again when it gets + * cleared, if a card is inserted. We have to mask + * the irq to prevent interrupt storm which will + * freeze the system. And the REMOVE gets the + * same situation. + * + * More testing is needed here to ensure it works + * for other platforms though. + */ + host->ier &= ~(SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE); + host->ier |= present ? SDHCI_INT_CARD_REMOVE : + SDHCI_INT_CARD_INSERT; + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); + + sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); + + host->thread_isr |= intmask & (SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE); + result = IRQ_WAKE_THREAD; + } - if (intmask & SDHCI_INT_BUS_POWER) { - pr_err("%s: Card is consuming too much power!\n", - mmc_hostname(host->mmc)); - sdhci_writel(host, SDHCI_INT_BUS_POWER, SDHCI_INT_STATUS); - } + if (intmask & SDHCI_INT_CMD_MASK) + sdhci_cmd_irq(host, intmask & SDHCI_INT_CMD_MASK); - intmask &= ~SDHCI_INT_BUS_POWER; + if (intmask & SDHCI_INT_DATA_MASK) + sdhci_data_irq(host, intmask & SDHCI_INT_DATA_MASK); - if (intmask & SDHCI_INT_CARD_INT) - cardint = 1; + if (intmask & SDHCI_INT_BUS_POWER) + pr_err("%s: Card is consuming too much power!\n", + mmc_hostname(host->mmc)); - intmask &= ~SDHCI_INT_CARD_INT; + if (intmask & SDHCI_INT_CARD_INT) { + sdhci_enable_sdio_irq_nolock(host, false); + host->thread_isr |= SDHCI_INT_CARD_INT; + result = IRQ_WAKE_THREAD; + } - if (intmask) { - unexpected |= intmask; - sdhci_writel(host, intmask, SDHCI_INT_STATUS); - } + intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE | + SDHCI_INT_CMD_MASK | SDHCI_INT_DATA_MASK | + SDHCI_INT_ERROR | SDHCI_INT_BUS_POWER | + SDHCI_INT_CARD_INT); - result = IRQ_HANDLED; + if (intmask) { + unexpected |= intmask; + sdhci_writel(host, intmask, SDHCI_INT_STATUS); + } - intmask = sdhci_readl(host, SDHCI_INT_STATUS); + if (result == IRQ_NONE) + result = IRQ_HANDLED; - /* - * If we know we'll call the driver to signal SDIO IRQ, disregard - * further indications of Card Interrupt in the status to avoid a - * needless loop. 
- */ - if (cardint) - intmask &= ~SDHCI_INT_CARD_INT; - if (intmask && --max_loops) - goto again; + intmask = sdhci_readl(host, SDHCI_INT_STATUS); + } while (intmask && --max_loops); out: spin_unlock(&host->lock); @@ -2527,15 +2477,38 @@ out: mmc_hostname(host->mmc), unexpected); sdhci_dumpregs(host); } - /* - * We have to delay this as it calls back into the driver. - */ - if (cardint) - mmc_signal_sdio_irq(host->mmc); return result; } +static irqreturn_t sdhci_thread_irq(int irq, void *dev_id) +{ + struct sdhci_host *host = dev_id; + unsigned long flags; + u32 isr; + + spin_lock_irqsave(&host->lock, flags); + isr = host->thread_isr; + host->thread_isr = 0; + spin_unlock_irqrestore(&host->lock, flags); + + if (isr & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { + sdhci_card_event(host->mmc); + mmc_detect_change(host->mmc, msecs_to_jiffies(200)); + } + + if (isr & SDHCI_INT_CARD_INT) { + sdio_run_irqs(host->mmc); + + spin_lock_irqsave(&host->lock, flags); + if (host->flags & SDHCI_SDIO_IRQ_ENABLED) + sdhci_enable_sdio_irq_nolock(host, true); + spin_unlock_irqrestore(&host->lock, flags); + } + + return isr ? IRQ_HANDLED : IRQ_NONE; +} + /*****************************************************************************\ * * * Suspend/resume * @@ -2572,9 +2545,6 @@ EXPORT_SYMBOL_GPL(sdhci_disable_irq_wakeups); int sdhci_suspend_host(struct sdhci_host *host) { - if (host->ops->platform_suspend) - host->ops->platform_suspend(host); - sdhci_disable_card_detection(host); /* Disable tuning since we are suspending */ @@ -2584,7 +2554,9 @@ int sdhci_suspend_host(struct sdhci_host *host) } if (!device_may_wakeup(mmc_dev(host->mmc))) { - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + host->ier = 0; + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); } else { sdhci_enable_irq_wakeups(host); @@ -2605,8 +2577,9 @@ int sdhci_resume_host(struct sdhci_host *host) } if (!device_may_wakeup(mmc_dev(host->mmc))) { - ret = request_irq(host->irq, sdhci_irq, IRQF_SHARED, - mmc_hostname(host->mmc), host); + ret = request_threaded_irq(host->irq, sdhci_irq, + sdhci_thread_irq, IRQF_SHARED, + mmc_hostname(host->mmc), host); if (ret) return ret; } else { @@ -2628,9 +2601,6 @@ int sdhci_resume_host(struct sdhci_host *host) sdhci_enable_card_detection(host); - if (host->ops->platform_resume) - host->ops->platform_resume(host); - /* Set the re-tuning expiration flag */ if (host->flags & SDHCI_USING_RETUNING_TIMER) host->flags |= SDHCI_NEEDS_RETUNING; @@ -2682,10 +2652,12 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host) } spin_lock_irqsave(&host->lock, flags); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + host->ier &= SDHCI_INT_CARD_INT; + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); + sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); spin_unlock_irqrestore(&host->lock, flags); - synchronize_irq(host->irq); + synchronize_hardirq(host->irq); spin_lock_irqsave(&host->lock, flags); host->runtime_suspended = true; @@ -2729,7 +2701,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host) host->runtime_suspended = false; /* Enable SDIO IRQ */ - if ((host->flags & SDHCI_SDIO_IRQ_ENABLED)) + if (host->flags & SDHCI_SDIO_IRQ_ENABLED) sdhci_enable_sdio_irq_nolock(host, true); /* Enable Card Detection */ @@ -2788,7 +2760,7 @@ int sdhci_add_host(struct sdhci_host *host) if (debug_quirks2) host->quirks2 = debug_quirks2; - sdhci_reset(host, SDHCI_RESET_ALL); + sdhci_do_reset(host, SDHCI_RESET_ALL); host->version = sdhci_readw(host, SDHCI_HOST_VERSION); 
host->version = (host->version & SDHCI_SPEC_VER_MASK) @@ -2848,15 +2820,29 @@ int sdhci_add_host(struct sdhci_host *host) * (128) and potentially one alignment transfer for * each of those entries. */ - host->adma_desc = kmalloc((128 * 2 + 1) * 4, GFP_KERNEL); + host->adma_desc = dma_alloc_coherent(mmc_dev(host->mmc), + ADMA_SIZE, &host->adma_addr, + GFP_KERNEL); host->align_buffer = kmalloc(128 * 4, GFP_KERNEL); if (!host->adma_desc || !host->align_buffer) { - kfree(host->adma_desc); + dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + host->adma_desc, host->adma_addr); kfree(host->align_buffer); pr_warning("%s: Unable to allocate ADMA " "buffers. Falling back to standard DMA.\n", mmc_hostname(mmc)); host->flags &= ~SDHCI_USE_ADMA; + host->adma_desc = NULL; + host->align_buffer = NULL; + } else if (host->adma_addr & 3) { + pr_warning("%s: unable to allocate aligned ADMA descriptor\n", + mmc_hostname(mmc)); + host->flags &= ~SDHCI_USE_ADMA; + dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + host->adma_desc, host->adma_addr); + kfree(host->align_buffer); + host->adma_desc = NULL; + host->align_buffer = NULL; } } @@ -2941,6 +2927,7 @@ int sdhci_add_host(struct sdhci_host *host) mmc->max_busy_timeout = (1 << 27) / host->timeout_clk; mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_ERASE | MMC_CAP_CMD23; + mmc->caps2 |= MMC_CAP2_SDIO_IRQ_NOTHREAD; if (host->quirks & SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12) host->flags |= SDHCI_AUTO_CMD12; @@ -3212,8 +3199,6 @@ int sdhci_add_host(struct sdhci_host *host) /* * Init tasklets. */ - tasklet_init(&host->card_tasklet, - sdhci_tasklet_card, (unsigned long)host); tasklet_init(&host->finish_tasklet, sdhci_tasklet_finish, (unsigned long)host); @@ -3230,8 +3215,8 @@ int sdhci_add_host(struct sdhci_host *host) sdhci_init(host, 0); - ret = request_irq(host->irq, sdhci_irq, IRQF_SHARED, - mmc_hostname(mmc), host); + ret = request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq, + IRQF_SHARED, mmc_hostname(mmc), host); if (ret) { pr_err("%s: Failed to request IRQ %d: %d\n", mmc_hostname(mmc), host->irq, ret); @@ -3273,12 +3258,12 @@ int sdhci_add_host(struct sdhci_host *host) #ifdef SDHCI_USE_LEDS_CLASS reset: - sdhci_reset(host, SDHCI_RESET_ALL); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + sdhci_do_reset(host, SDHCI_RESET_ALL); + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); #endif untasklet: - tasklet_kill(&host->card_tasklet); tasklet_kill(&host->finish_tasklet); return ret; @@ -3315,14 +3300,14 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) #endif if (!dead) - sdhci_reset(host, SDHCI_RESET_ALL); + sdhci_do_reset(host, SDHCI_RESET_ALL); - sdhci_mask_irqs(host, SDHCI_INT_ALL_MASK); + sdhci_writel(host, 0, SDHCI_INT_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); free_irq(host->irq, host); del_timer_sync(&host->timer); - tasklet_kill(&host->card_tasklet); tasklet_kill(&host->finish_tasklet); if (host->vmmc) { @@ -3335,7 +3320,9 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) regulator_put(host->vqmmc); } - kfree(host->adma_desc); + if (host->adma_desc) + dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + host->adma_desc, host->adma_addr); kfree(host->align_buffer); host->adma_desc = NULL; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 0a3ed01887db..4a5cd5e3fa3e 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -281,18 +281,14 @@ struct sdhci_ops { unsigned int (*get_max_clock)(struct sdhci_host *host); unsigned int 
(*get_min_clock)(struct sdhci_host *host); unsigned int (*get_timeout_clock)(struct sdhci_host *host); - int (*platform_bus_width)(struct sdhci_host *host, - int width); + void (*set_bus_width)(struct sdhci_host *host, int width); void (*platform_send_init_74_clocks)(struct sdhci_host *host, u8 power_mode); unsigned int (*get_ro)(struct sdhci_host *host); - void (*platform_reset_enter)(struct sdhci_host *host, u8 mask); - void (*platform_reset_exit)(struct sdhci_host *host, u8 mask); + void (*reset)(struct sdhci_host *host, u8 mask); int (*platform_execute_tuning)(struct sdhci_host *host, u32 opcode); - int (*set_uhs_signaling)(struct sdhci_host *host, unsigned int uhs); + void (*set_uhs_signaling)(struct sdhci_host *host, unsigned int uhs); void (*hw_reset)(struct sdhci_host *host); - void (*platform_suspend)(struct sdhci_host *host); - void (*platform_resume)(struct sdhci_host *host); void (*adma_workaround)(struct sdhci_host *host, u32 intmask); void (*platform_init)(struct sdhci_host *host); void (*card_event)(struct sdhci_host *host); @@ -397,6 +393,16 @@ extern void sdhci_remove_host(struct sdhci_host *host, int dead); extern void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd); +static inline bool sdhci_sdio_irq_enabled(struct sdhci_host *host) +{ + return !!(host->flags & SDHCI_SDIO_IRQ_ENABLED); +} + +void sdhci_set_clock(struct sdhci_host *host, unsigned int clock); +void sdhci_set_bus_width(struct sdhci_host *host, int width); +void sdhci_reset(struct sdhci_host *host, u8 mask); +void sdhci_set_uhs_signaling(struct sdhci_host *host, unsigned timing); + #ifdef CONFIG_PM extern int sdhci_suspend_host(struct sdhci_host *host); extern int sdhci_resume_host(struct sdhci_host *host); diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 54730f4aac87..656fbba4c422 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -803,12 +803,13 @@ static u32 sh_mmcif_set_cmd(struct sh_mmcif_host *host, break; } switch (host->timing) { - case MMC_TIMING_UHS_DDR50: + case MMC_TIMING_MMC_DDR52: /* * MMC core will only set this timing, if the host - * advertises the MMC_CAP_UHS_DDR50 capability. MMCIF - * implementations with this capability, e.g. sh73a0, - * will have to set it in their platform data. + * advertises the MMC_CAP_1_8V_DDR/MMC_CAP_1_2V_DDR + * capability. MMCIF implementations with this + * capability, e.g. sh73a0, will have to set it + * in their platform data. */ tmp |= CMD_SET_DARS; break; diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c new file mode 100644 index 000000000000..eb2bbbef19c6 --- /dev/null +++ b/drivers/mmc/host/usdhi6rol0.c @@ -0,0 +1,1847 @@ +/* + * Copyright (C) 2013-2014 Renesas Electronics Europe Ltd. + * Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
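[Editor's note] The sdhci.h hunk above turns reset, bus-width and UHS signalling into mandatory ops and exports library implementations (sdhci_set_clock(), sdhci_set_bus_width(), sdhci_reset(), sdhci_set_uhs_signaling()), replacing the old platform_reset_enter/exit and platform_suspend/resume hooks. A hypothetical glue driver would now wire these up explicitly, wrapping one where its controller needs extra steps (sketch only; the my_pltfm_* names are invented):

static void my_pltfm_reset(struct sdhci_host *host, u8 mask)
{
	sdhci_reset(host, mask);		/* generic SDHCI soft reset */
	if (mask & SDHCI_RESET_ALL)
		my_pltfm_restore_regs(host);	/* hypothetical controller fixup */
}

static const struct sdhci_ops my_pltfm_ops = {
	.set_clock		= sdhci_set_clock,
	.set_bus_width		= sdhci_set_bus_width,
	.reset			= my_pltfm_reset,
	.set_uhs_signaling	= sdhci_set_uhs_signaling,
};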
+ */ + +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> +#include <linux/highmem.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/log2.h> +#include <linux/mmc/host.h> +#include <linux/mmc/mmc.h> +#include <linux/mmc/sd.h> +#include <linux/mmc/sdio.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/platform_device.h> +#include <linux/scatterlist.h> +#include <linux/string.h> +#include <linux/time.h> +#include <linux/virtio.h> +#include <linux/workqueue.h> + +#define USDHI6_SD_CMD 0x0000 +#define USDHI6_SD_PORT_SEL 0x0004 +#define USDHI6_SD_ARG 0x0008 +#define USDHI6_SD_STOP 0x0010 +#define USDHI6_SD_SECCNT 0x0014 +#define USDHI6_SD_RSP10 0x0018 +#define USDHI6_SD_RSP32 0x0020 +#define USDHI6_SD_RSP54 0x0028 +#define USDHI6_SD_RSP76 0x0030 +#define USDHI6_SD_INFO1 0x0038 +#define USDHI6_SD_INFO2 0x003c +#define USDHI6_SD_INFO1_MASK 0x0040 +#define USDHI6_SD_INFO2_MASK 0x0044 +#define USDHI6_SD_CLK_CTRL 0x0048 +#define USDHI6_SD_SIZE 0x004c +#define USDHI6_SD_OPTION 0x0050 +#define USDHI6_SD_ERR_STS1 0x0058 +#define USDHI6_SD_ERR_STS2 0x005c +#define USDHI6_SD_BUF0 0x0060 +#define USDHI6_SDIO_MODE 0x0068 +#define USDHI6_SDIO_INFO1 0x006c +#define USDHI6_SDIO_INFO1_MASK 0x0070 +#define USDHI6_CC_EXT_MODE 0x01b0 +#define USDHI6_SOFT_RST 0x01c0 +#define USDHI6_VERSION 0x01c4 +#define USDHI6_HOST_MODE 0x01c8 +#define USDHI6_SDIF_MODE 0x01cc + +#define USDHI6_SD_CMD_APP 0x0040 +#define USDHI6_SD_CMD_MODE_RSP_AUTO 0x0000 +#define USDHI6_SD_CMD_MODE_RSP_NONE 0x0300 +#define USDHI6_SD_CMD_MODE_RSP_R1 0x0400 /* Also R5, R6, R7 */ +#define USDHI6_SD_CMD_MODE_RSP_R1B 0x0500 /* R1b */ +#define USDHI6_SD_CMD_MODE_RSP_R2 0x0600 +#define USDHI6_SD_CMD_MODE_RSP_R3 0x0700 /* Also R4 */ +#define USDHI6_SD_CMD_DATA 0x0800 +#define USDHI6_SD_CMD_READ 0x1000 +#define USDHI6_SD_CMD_MULTI 0x2000 +#define USDHI6_SD_CMD_CMD12_AUTO_OFF 0x4000 + +#define USDHI6_CC_EXT_MODE_SDRW BIT(1) + +#define USDHI6_SD_INFO1_RSP_END BIT(0) +#define USDHI6_SD_INFO1_ACCESS_END BIT(2) +#define USDHI6_SD_INFO1_CARD_OUT BIT(3) +#define USDHI6_SD_INFO1_CARD_IN BIT(4) +#define USDHI6_SD_INFO1_CD BIT(5) +#define USDHI6_SD_INFO1_WP BIT(7) +#define USDHI6_SD_INFO1_D3_CARD_OUT BIT(8) +#define USDHI6_SD_INFO1_D3_CARD_IN BIT(9) + +#define USDHI6_SD_INFO2_CMD_ERR BIT(0) +#define USDHI6_SD_INFO2_CRC_ERR BIT(1) +#define USDHI6_SD_INFO2_END_ERR BIT(2) +#define USDHI6_SD_INFO2_TOUT BIT(3) +#define USDHI6_SD_INFO2_IWA_ERR BIT(4) +#define USDHI6_SD_INFO2_IRA_ERR BIT(5) +#define USDHI6_SD_INFO2_RSP_TOUT BIT(6) +#define USDHI6_SD_INFO2_SDDAT0 BIT(7) +#define USDHI6_SD_INFO2_BRE BIT(8) +#define USDHI6_SD_INFO2_BWE BIT(9) +#define USDHI6_SD_INFO2_SCLKDIVEN BIT(13) +#define USDHI6_SD_INFO2_CBSY BIT(14) +#define USDHI6_SD_INFO2_ILA BIT(15) + +#define USDHI6_SD_INFO1_CARD_INSERT (USDHI6_SD_INFO1_CARD_IN | USDHI6_SD_INFO1_D3_CARD_IN) +#define USDHI6_SD_INFO1_CARD_EJECT (USDHI6_SD_INFO1_CARD_OUT | USDHI6_SD_INFO1_D3_CARD_OUT) +#define USDHI6_SD_INFO1_CARD (USDHI6_SD_INFO1_CARD_INSERT | USDHI6_SD_INFO1_CARD_EJECT) +#define USDHI6_SD_INFO1_CARD_CD (USDHI6_SD_INFO1_CARD_IN | USDHI6_SD_INFO1_CARD_OUT) + +#define USDHI6_SD_INFO2_ERR (USDHI6_SD_INFO2_CMD_ERR | \ + USDHI6_SD_INFO2_CRC_ERR | USDHI6_SD_INFO2_END_ERR | \ + USDHI6_SD_INFO2_TOUT | USDHI6_SD_INFO2_IWA_ERR | \ + USDHI6_SD_INFO2_IRA_ERR | USDHI6_SD_INFO2_RSP_TOUT | \ + USDHI6_SD_INFO2_ILA) + +#define USDHI6_SD_INFO1_IRQ (USDHI6_SD_INFO1_RSP_END | USDHI6_SD_INFO1_ACCESS_END 
| \ + USDHI6_SD_INFO1_CARD) + +#define USDHI6_SD_INFO2_IRQ (USDHI6_SD_INFO2_ERR | USDHI6_SD_INFO2_BRE | \ + USDHI6_SD_INFO2_BWE | 0x0800 | USDHI6_SD_INFO2_ILA) + +#define USDHI6_SD_CLK_CTRL_SCLKEN BIT(8) + +#define USDHI6_SD_STOP_STP BIT(0) +#define USDHI6_SD_STOP_SEC BIT(8) + +#define USDHI6_SDIO_INFO1_IOIRQ BIT(0) +#define USDHI6_SDIO_INFO1_EXPUB52 BIT(14) +#define USDHI6_SDIO_INFO1_EXWT BIT(15) + +#define USDHI6_SD_ERR_STS1_CRC_NO_ERROR BIT(13) + +#define USDHI6_SOFT_RST_RESERVED (BIT(1) | BIT(2)) +#define USDHI6_SOFT_RST_RESET BIT(0) + +#define USDHI6_SD_OPTION_TIMEOUT_SHIFT 4 +#define USDHI6_SD_OPTION_TIMEOUT_MASK (0xf << USDHI6_SD_OPTION_TIMEOUT_SHIFT) +#define USDHI6_SD_OPTION_WIDTH_1 BIT(15) + +#define USDHI6_SD_PORT_SEL_PORTS_SHIFT 8 + +#define USDHI6_SD_CLK_CTRL_DIV_MASK 0xff + +#define USDHI6_SDIO_INFO1_IRQ (USDHI6_SDIO_INFO1_IOIRQ | 3 | \ + USDHI6_SDIO_INFO1_EXPUB52 | USDHI6_SDIO_INFO1_EXWT) + +#define USDHI6_MIN_DMA 64 + +enum usdhi6_wait_for { + USDHI6_WAIT_FOR_REQUEST, + USDHI6_WAIT_FOR_CMD, + USDHI6_WAIT_FOR_MREAD, + USDHI6_WAIT_FOR_MWRITE, + USDHI6_WAIT_FOR_READ, + USDHI6_WAIT_FOR_WRITE, + USDHI6_WAIT_FOR_DATA_END, + USDHI6_WAIT_FOR_STOP, + USDHI6_WAIT_FOR_DMA, +}; + +struct usdhi6_page { + struct page *page; + void *mapped; /* mapped page */ +}; + +struct usdhi6_host { + struct mmc_host *mmc; + struct mmc_request *mrq; + void __iomem *base; + struct clk *clk; + + /* SG memory handling */ + + /* Common for multiple and single block requests */ + struct usdhi6_page pg; /* current page from an SG */ + void *blk_page; /* either a mapped page, or the bounce buffer */ + size_t offset; /* offset within a page, including sg->offset */ + + /* Blocks, crossing a page boundary */ + size_t head_len; + struct usdhi6_page head_pg; + + /* A bounce buffer for unaligned blocks or blocks, crossing a page boundary */ + struct scatterlist bounce_sg; + u8 bounce_buf[512]; + + /* Multiple block requests only */ + struct scatterlist *sg; /* current SG segment */ + int page_idx; /* page index within an SG segment */ + + enum usdhi6_wait_for wait; + u32 status_mask; + u32 status2_mask; + u32 sdio_mask; + u32 io_error; + u32 irq_status; + unsigned long imclk; + unsigned long rate; + bool app_cmd; + + /* Timeout handling */ + struct delayed_work timeout_work; + unsigned long timeout; + + /* DMA support */ + struct dma_chan *chan_rx; + struct dma_chan *chan_tx; + bool dma_active; +}; + +/* I/O primitives */ + +static void usdhi6_write(struct usdhi6_host *host, u32 reg, u32 data) +{ + iowrite32(data, host->base + reg); + dev_vdbg(mmc_dev(host->mmc), "%s(0x%p + 0x%x) = 0x%x\n", __func__, + host->base, reg, data); +} + +static void usdhi6_write16(struct usdhi6_host *host, u32 reg, u16 data) +{ + iowrite16(data, host->base + reg); + dev_vdbg(mmc_dev(host->mmc), "%s(0x%p + 0x%x) = 0x%x\n", __func__, + host->base, reg, data); +} + +static u32 usdhi6_read(struct usdhi6_host *host, u32 reg) +{ + u32 data = ioread32(host->base + reg); + dev_vdbg(mmc_dev(host->mmc), "%s(0x%p + 0x%x) = 0x%x\n", __func__, + host->base, reg, data); + return data; +} + +static u16 usdhi6_read16(struct usdhi6_host *host, u32 reg) +{ + u16 data = ioread16(host->base + reg); + dev_vdbg(mmc_dev(host->mmc), "%s(0x%p + 0x%x) = 0x%x\n", __func__, + host->base, reg, data); + return data; +} + +static void usdhi6_irq_enable(struct usdhi6_host *host, u32 info1, u32 info2) +{ + host->status_mask = USDHI6_SD_INFO1_IRQ & ~info1; + host->status2_mask = USDHI6_SD_INFO2_IRQ & ~info2; + usdhi6_write(host, USDHI6_SD_INFO1_MASK, host->status_mask); 
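[Editor's note] The INFO1/INFO2 mask registers written here use inverted polarity: a set bit masks an interrupt source, so usdhi6_irq_enable() stores "all managed IRQ bits minus the requested ones". Worked example for usdhi6_wait_for_resp() below, using the bit definitions above (illustrative arithmetic, not additional driver code):

	info1       = RSP_END | ACCESS_END | CARD_CD = 0x001 | 0x004 | 0x018 = 0x01d
	INFO1_IRQ   = RSP_END | ACCESS_END | CARD                            = 0x31d
	status_mask = INFO1_IRQ & ~info1             = 0x31d & ~0x01d        = 0x300

That is, only the DAT3-based hotplug bits (8 and 9) remain masked, while response-end, access-end and the conventional card-detect interrupts are enabled (0 = enabled).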
+ usdhi6_write(host, USDHI6_SD_INFO2_MASK, host->status2_mask); +} + +static void usdhi6_wait_for_resp(struct usdhi6_host *host) +{ + usdhi6_irq_enable(host, USDHI6_SD_INFO1_RSP_END | + USDHI6_SD_INFO1_ACCESS_END | USDHI6_SD_INFO1_CARD_CD, + USDHI6_SD_INFO2_ERR); +} + +static void usdhi6_wait_for_brwe(struct usdhi6_host *host, bool read) +{ + usdhi6_irq_enable(host, USDHI6_SD_INFO1_ACCESS_END | + USDHI6_SD_INFO1_CARD_CD, USDHI6_SD_INFO2_ERR | + (read ? USDHI6_SD_INFO2_BRE : USDHI6_SD_INFO2_BWE)); +} + +static void usdhi6_only_cd(struct usdhi6_host *host) +{ + /* Mask all except card hotplug */ + usdhi6_irq_enable(host, USDHI6_SD_INFO1_CARD_CD, 0); +} + +static void usdhi6_mask_all(struct usdhi6_host *host) +{ + usdhi6_irq_enable(host, 0, 0); +} + +static int usdhi6_error_code(struct usdhi6_host *host) +{ + u32 err; + + usdhi6_write(host, USDHI6_SD_STOP, USDHI6_SD_STOP_STP); + + if (host->io_error & + (USDHI6_SD_INFO2_RSP_TOUT | USDHI6_SD_INFO2_TOUT)) { + u32 rsp54 = usdhi6_read(host, USDHI6_SD_RSP54); + int opc = host->mrq ? host->mrq->cmd->opcode : -1; + + err = usdhi6_read(host, USDHI6_SD_ERR_STS2); + /* Response timeout is often normal, don't spam the log */ + if (host->wait == USDHI6_WAIT_FOR_CMD) + dev_dbg(mmc_dev(host->mmc), + "T-out sts 0x%x, resp 0x%x, state %u, CMD%d\n", + err, rsp54, host->wait, opc); + else + dev_warn(mmc_dev(host->mmc), + "T-out sts 0x%x, resp 0x%x, state %u, CMD%d\n", + err, rsp54, host->wait, opc); + return -ETIMEDOUT; + } + + err = usdhi6_read(host, USDHI6_SD_ERR_STS1); + if (err != USDHI6_SD_ERR_STS1_CRC_NO_ERROR) + dev_warn(mmc_dev(host->mmc), "Err sts 0x%x, state %u, CMD%d\n", + err, host->wait, host->mrq ? host->mrq->cmd->opcode : -1); + if (host->io_error & USDHI6_SD_INFO2_ILA) + return -EILSEQ; + + return -EIO; +} + +/* Scatter-Gather management */ + +/* + * In PIO mode we have to map each page separately, using kmap(). That way + * adjacent pages are mapped to non-adjacent virtual addresses. That's why we + * have to use a bounce buffer for blocks, crossing page boundaries. Such blocks + * have been observed with an SDIO WiFi card (b43 driver). + */ +static void usdhi6_blk_bounce(struct usdhi6_host *host, + struct scatterlist *sg) +{ + struct mmc_data *data = host->mrq->data; + size_t blk_head = host->head_len; + + dev_dbg(mmc_dev(host->mmc), "%s(): CMD%u of %u SG: %ux%u @ 0x%x\n", + __func__, host->mrq->cmd->opcode, data->sg_len, + data->blksz, data->blocks, sg->offset); + + host->head_pg.page = host->pg.page; + host->head_pg.mapped = host->pg.mapped; + host->pg.page = nth_page(host->pg.page, 1); + host->pg.mapped = kmap(host->pg.page); + + host->blk_page = host->bounce_buf; + host->offset = 0; + + if (data->flags & MMC_DATA_READ) + return; + + memcpy(host->bounce_buf, host->head_pg.mapped + PAGE_SIZE - blk_head, + blk_head); + memcpy(host->bounce_buf + blk_head, host->pg.mapped, + data->blksz - blk_head); +} + +/* Only called for multiple block IO */ +static void usdhi6_sg_prep(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + struct mmc_data *data = mrq->data; + + usdhi6_write(host, USDHI6_SD_SECCNT, data->blocks); + + host->sg = data->sg; + /* TODO: if we always map, this is redundant */ + host->offset = host->sg->offset; +} + +/* Map the first page in an SG segment: common for multiple and single block IO */ +static void *usdhi6_sg_map(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + struct scatterlist *sg = data->sg_len > 1 ? 
host->sg : data->sg; + size_t head = PAGE_SIZE - sg->offset; + size_t blk_head = head % data->blksz; + + WARN(host->pg.page, "%p not properly unmapped!\n", host->pg.page); + if (WARN(sg_dma_len(sg) % data->blksz, + "SG size %zd isn't a multiple of block size %zd\n", + sg_dma_len(sg), data->blksz)) + return NULL; + + host->pg.page = sg_page(sg); + host->pg.mapped = kmap(host->pg.page); + host->offset = sg->offset; + + /* + * Block size must be a power of 2 for multi-block transfers, + * therefore blk_head is equal for all pages in this SG + */ + host->head_len = blk_head; + + if (head < data->blksz) + /* + * The first block in the SG crosses a page boundary. + * Max blksz = 512, so blocks can only span 2 pages + */ + usdhi6_blk_bounce(host, sg); + else + host->blk_page = host->pg.mapped; + + dev_dbg(mmc_dev(host->mmc), "Mapped %p (%lx) at %p + %u for CMD%u @ 0x%p\n", + host->pg.page, page_to_pfn(host->pg.page), host->pg.mapped, + sg->offset, host->mrq->cmd->opcode, host->mrq); + + return host->blk_page + host->offset; +} + +/* Unmap the current page: common for multiple and single block IO */ +static void usdhi6_sg_unmap(struct usdhi6_host *host, bool force) +{ + struct mmc_data *data = host->mrq->data; + struct page *page = host->head_pg.page; + + if (page) { + /* Previous block was cross-page boundary */ + struct scatterlist *sg = data->sg_len > 1 ? + host->sg : data->sg; + size_t blk_head = host->head_len; + + if (!data->error && data->flags & MMC_DATA_READ) { + memcpy(host->head_pg.mapped + PAGE_SIZE - blk_head, + host->bounce_buf, blk_head); + memcpy(host->pg.mapped, host->bounce_buf + blk_head, + data->blksz - blk_head); + } + + flush_dcache_page(page); + kunmap(page); + + host->head_pg.page = NULL; + + if (!force && sg_dma_len(sg) + sg->offset > + (host->page_idx << PAGE_SHIFT) + data->blksz - blk_head) + /* More blocks in this SG, don't unmap the next page */ + return; + } + + page = host->pg.page; + if (!page) + return; + + flush_dcache_page(page); + kunmap(page); + + host->pg.page = NULL; +} + +/* Called from MMC_WRITE_MULTIPLE_BLOCK or MMC_READ_MULTIPLE_BLOCK */ +static void usdhi6_sg_advance(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + size_t done, total; + + /* New offset: set at the end of the previous block */ + if (host->head_pg.page) { + /* Finished a cross-page block, jump to the new page */ + host->page_idx++; + host->offset = data->blksz - host->head_len; + host->blk_page = host->pg.mapped; + usdhi6_sg_unmap(host, false); + } else { + host->offset += data->blksz; + /* The completed block didn't cross a page boundary */ + if (host->offset == PAGE_SIZE) { + /* If required, we'll map the page below */ + host->offset = 0; + host->page_idx++; + } + } + + /* + * Now host->blk_page + host->offset point at the end of our last block + * and host->page_idx is the index of the page, in which our new block + * is located, if any + */ + + done = (host->page_idx << PAGE_SHIFT) + host->offset; + total = host->sg->offset + sg_dma_len(host->sg); + + dev_dbg(mmc_dev(host->mmc), "%s(): %zu of %zu @ %u\n", __func__, + done, total, host->offset); + + if (done < total && host->offset) { + /* More blocks in this page */ + if (host->offset + data->blksz > PAGE_SIZE) + /* We approached at a block, that spans 2 pages */ + usdhi6_blk_bounce(host, host->sg); + + return; + } + + /* Finished current page or an SG segment */ + usdhi6_sg_unmap(host, false); + + if (done == total) { + /* + * End of an SG segment or the complete SG: jump to the next + * segment, we'll map it 
later in usdhi6_blk_read() or + * usdhi6_blk_write() + */ + struct scatterlist *next = sg_next(host->sg); + + host->page_idx = 0; + + if (!next) + host->wait = USDHI6_WAIT_FOR_DATA_END; + host->sg = next; + + if (WARN(next && sg_dma_len(next) % data->blksz, + "SG size %zd isn't a multiple of block size %zd\n", + sg_dma_len(next), data->blksz)) + data->error = -EINVAL; + + return; + } + + /* We cannot get here after crossing a page border */ + + /* Next page in the same SG */ + host->pg.page = nth_page(sg_page(host->sg), host->page_idx); + host->pg.mapped = kmap(host->pg.page); + host->blk_page = host->pg.mapped; + + dev_dbg(mmc_dev(host->mmc), "Mapped %p (%lx) at %p for CMD%u @ 0x%p\n", + host->pg.page, page_to_pfn(host->pg.page), host->pg.mapped, + host->mrq->cmd->opcode, host->mrq); +} + +/* DMA handling */ + +static void usdhi6_dma_release(struct usdhi6_host *host) +{ + host->dma_active = false; + if (host->chan_tx) { + struct dma_chan *chan = host->chan_tx; + host->chan_tx = NULL; + dma_release_channel(chan); + } + if (host->chan_rx) { + struct dma_chan *chan = host->chan_rx; + host->chan_rx = NULL; + dma_release_channel(chan); + } +} + +static void usdhi6_dma_stop_unmap(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + + if (!host->dma_active) + return; + + usdhi6_write(host, USDHI6_CC_EXT_MODE, 0); + host->dma_active = false; + + if (data->flags & MMC_DATA_READ) + dma_unmap_sg(host->chan_rx->device->dev, data->sg, + data->sg_len, DMA_FROM_DEVICE); + else + dma_unmap_sg(host->chan_tx->device->dev, data->sg, + data->sg_len, DMA_TO_DEVICE); +} + +static void usdhi6_dma_complete(void *arg) +{ + struct usdhi6_host *host = arg; + struct mmc_request *mrq = host->mrq; + + if (WARN(!mrq || !mrq->data, "%s: NULL data in DMA completion for %p!\n", + dev_name(mmc_dev(host->mmc)), mrq)) + return; + + dev_dbg(mmc_dev(host->mmc), "%s(): CMD%u DMA completed\n", __func__, + mrq->cmd->opcode); + + usdhi6_dma_stop_unmap(host); + usdhi6_wait_for_brwe(host, mrq->data->flags & MMC_DATA_READ); +} + +static int usdhi6_dma_setup(struct usdhi6_host *host, struct dma_chan *chan, + enum dma_transfer_direction dir) +{ + struct mmc_data *data = host->mrq->data; + struct scatterlist *sg = data->sg; + struct dma_async_tx_descriptor *desc = NULL; + dma_cookie_t cookie = -EINVAL; + enum dma_data_direction data_dir; + int ret; + + switch (dir) { + case DMA_MEM_TO_DEV: + data_dir = DMA_TO_DEVICE; + break; + case DMA_DEV_TO_MEM: + data_dir = DMA_FROM_DEVICE; + break; + default: + return -EINVAL; + } + + ret = dma_map_sg(chan->device->dev, sg, data->sg_len, data_dir); + if (ret > 0) { + host->dma_active = true; + desc = dmaengine_prep_slave_sg(chan, sg, ret, dir, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + } + + if (desc) { + desc->callback = usdhi6_dma_complete; + desc->callback_param = host; + cookie = dmaengine_submit(desc); + } + + dev_dbg(mmc_dev(host->mmc), "%s(): mapped %d -> %d, cookie %d @ %p\n", + __func__, data->sg_len, ret, cookie, desc); + + if (cookie < 0) { + /* DMA failed, fall back to PIO */ + if (ret >= 0) + ret = cookie; + usdhi6_dma_release(host); + dev_warn(mmc_dev(host->mmc), + "DMA failed: %d, falling back to PIO\n", ret); + } + + return cookie; +} + +static int usdhi6_dma_start(struct usdhi6_host *host) +{ + if (!host->chan_rx || !host->chan_tx) + return -ENODEV; + + if (host->mrq->data->flags & MMC_DATA_READ) + return usdhi6_dma_setup(host, host->chan_rx, DMA_DEV_TO_MEM); + + return usdhi6_dma_setup(host, host->chan_tx, DMA_MEM_TO_DEV); +} + +static void usdhi6_dma_kill(struct 
usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + + dev_dbg(mmc_dev(host->mmc), "%s(): SG of %u: %ux%u\n", + __func__, data->sg_len, data->blocks, data->blksz); + /* Abort DMA */ + if (data->flags & MMC_DATA_READ) + dmaengine_terminate_all(host->chan_rx); + else + dmaengine_terminate_all(host->chan_tx); +} + +static void usdhi6_dma_check_error(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + + dev_dbg(mmc_dev(host->mmc), "%s(): IO error %d, status 0x%x\n", + __func__, host->io_error, usdhi6_read(host, USDHI6_SD_INFO1)); + + if (host->io_error) { + data->error = usdhi6_error_code(host); + data->bytes_xfered = 0; + usdhi6_dma_kill(host); + usdhi6_dma_release(host); + dev_warn(mmc_dev(host->mmc), + "DMA failed: %d, falling back to PIO\n", data->error); + return; + } + + /* + * The datasheet tells us to check a response from the card, whereas + * responses only come after the command phase, not after the data + * phase. Let's check anyway. + */ + if (host->irq_status & USDHI6_SD_INFO1_RSP_END) + dev_warn(mmc_dev(host->mmc), "Unexpected response received!\n"); +} + +static void usdhi6_dma_kick(struct usdhi6_host *host) +{ + if (host->mrq->data->flags & MMC_DATA_READ) + dma_async_issue_pending(host->chan_rx); + else + dma_async_issue_pending(host->chan_tx); +} + +static void usdhi6_dma_request(struct usdhi6_host *host, phys_addr_t start) +{ + struct dma_slave_config cfg = { + .src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES, + .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES, + }; + int ret; + + host->chan_tx = dma_request_slave_channel(mmc_dev(host->mmc), "tx"); + dev_dbg(mmc_dev(host->mmc), "%s: TX: got channel %p\n", __func__, + host->chan_tx); + + if (!host->chan_tx) + return; + + cfg.direction = DMA_MEM_TO_DEV; + cfg.dst_addr = start + USDHI6_SD_BUF0; + cfg.dst_maxburst = 128; /* 128 words * 4 bytes = 512 bytes */ + cfg.src_addr = 0; + ret = dmaengine_slave_config(host->chan_tx, &cfg); + if (ret < 0) + goto e_release_tx; + + host->chan_rx = dma_request_slave_channel(mmc_dev(host->mmc), "rx"); + dev_dbg(mmc_dev(host->mmc), "%s: RX: got channel %p\n", __func__, + host->chan_rx); + + if (!host->chan_rx) + goto e_release_tx; + + cfg.direction = DMA_DEV_TO_MEM; + cfg.src_addr = cfg.dst_addr; + cfg.src_maxburst = 128; /* 128 words * 4 bytes = 512 bytes */ + cfg.dst_addr = 0; + ret = dmaengine_slave_config(host->chan_rx, &cfg); + if (ret < 0) + goto e_release_rx; + + return; + +e_release_rx: + dma_release_channel(host->chan_rx); + host->chan_rx = NULL; +e_release_tx: + dma_release_channel(host->chan_tx); + host->chan_tx = NULL; +} + +/* API helpers */ + +static void usdhi6_clk_set(struct usdhi6_host *host, struct mmc_ios *ios) +{ + unsigned long rate = ios->clock; + u32 val; + unsigned int i; + + for (i = 1000; i; i--) { + if (usdhi6_read(host, USDHI6_SD_INFO2) & USDHI6_SD_INFO2_SCLKDIVEN) + break; + usleep_range(10, 100); + } + + if (!i) { + dev_err(mmc_dev(host->mmc), "SD bus busy, clock set aborted\n"); + return; + } + + val = usdhi6_read(host, USDHI6_SD_CLK_CTRL) & ~USDHI6_SD_CLK_CTRL_DIV_MASK; + + if (rate) { + unsigned long new_rate; + + if (host->imclk <= rate) { + if (ios->timing != MMC_TIMING_UHS_DDR50) { + /* Cannot have 1-to-1 clock in DDR mode */ + new_rate = host->imclk; + val |= 0xff; + } else { + new_rate = host->imclk / 2; + } + } else { + unsigned long div = + roundup_pow_of_two(DIV_ROUND_UP(host->imclk, rate)); + val |= div >> 2; + new_rate = host->imclk / div; + } + + if (host->rate == new_rate) + return; + + host->rate = new_rate; + + 
dev_dbg(mmc_dev(host->mmc), "target %lu, div %u, set %lu\n", + rate, (val & 0xff) << 2, new_rate); + } + + /* + * if old or new rate is equal to input rate, have to switch the clock + * off before changing and on after + */ + if (host->imclk == rate || host->imclk == host->rate || !rate) + usdhi6_write(host, USDHI6_SD_CLK_CTRL, + val & ~USDHI6_SD_CLK_CTRL_SCLKEN); + + if (!rate) { + host->rate = 0; + return; + } + + usdhi6_write(host, USDHI6_SD_CLK_CTRL, val); + + if (host->imclk == rate || host->imclk == host->rate || + !(val & USDHI6_SD_CLK_CTRL_SCLKEN)) + usdhi6_write(host, USDHI6_SD_CLK_CTRL, + val | USDHI6_SD_CLK_CTRL_SCLKEN); +} + +static void usdhi6_set_power(struct usdhi6_host *host, struct mmc_ios *ios) +{ + struct mmc_host *mmc = host->mmc; + + if (!IS_ERR(mmc->supply.vmmc)) + /* Errors ignored... */ + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, + ios->power_mode ? ios->vdd : 0); +} + +static int usdhi6_reset(struct usdhi6_host *host) +{ + int i; + + usdhi6_write(host, USDHI6_SOFT_RST, USDHI6_SOFT_RST_RESERVED); + cpu_relax(); + usdhi6_write(host, USDHI6_SOFT_RST, USDHI6_SOFT_RST_RESERVED | USDHI6_SOFT_RST_RESET); + for (i = 1000; i; i--) + if (usdhi6_read(host, USDHI6_SOFT_RST) & USDHI6_SOFT_RST_RESET) + break; + + return i ? 0 : -ETIMEDOUT; +} + +static void usdhi6_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct usdhi6_host *host = mmc_priv(mmc); + u32 option, mode; + int ret; + + dev_dbg(mmc_dev(mmc), "%uHz, OCR: %u, power %u, bus-width %u, timing %u\n", + ios->clock, ios->vdd, ios->power_mode, ios->bus_width, ios->timing); + + switch (ios->power_mode) { + case MMC_POWER_OFF: + usdhi6_set_power(host, ios); + usdhi6_only_cd(host); + break; + case MMC_POWER_UP: + /* + * We only also touch USDHI6_SD_OPTION from .request(), which + * cannot race with MMC_POWER_UP + */ + ret = usdhi6_reset(host); + if (ret < 0) { + dev_err(mmc_dev(mmc), "Cannot reset the interface!\n"); + } else { + usdhi6_set_power(host, ios); + usdhi6_only_cd(host); + } + break; + case MMC_POWER_ON: + option = usdhi6_read(host, USDHI6_SD_OPTION); + /* + * The eMMC standard only allows 4 or 8 bits in the DDR mode, + * the same probably holds for SD cards. We check here anyway, + * since the datasheet explicitly requires 4 bits for DDR. + */ + if (ios->bus_width == MMC_BUS_WIDTH_1) { + if (ios->timing == MMC_TIMING_UHS_DDR50) + dev_err(mmc_dev(mmc), + "4 bits are required for DDR\n"); + option |= USDHI6_SD_OPTION_WIDTH_1; + mode = 0; + } else { + option &= ~USDHI6_SD_OPTION_WIDTH_1; + mode = ios->timing == MMC_TIMING_UHS_DDR50; + } + usdhi6_write(host, USDHI6_SD_OPTION, option); + usdhi6_write(host, USDHI6_SDIF_MODE, mode); + break; + } + + if (host->rate != ios->clock) + usdhi6_clk_set(host, ios); +} + +/* This is data timeout. Response timeout is fixed to 640 clock cycles */ +static void usdhi6_timeout_set(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + u32 val; + unsigned long ticks; + + if (!mrq->data) + ticks = host->rate / 1000 * mrq->cmd->busy_timeout; + else + ticks = host->rate / 1000000 * (mrq->data->timeout_ns / 1000) + + mrq->data->timeout_clks; + + if (!ticks || ticks > 1 << 27) + /* Max timeout */ + val = 14; + else if (ticks < 1 << 13) + /* Min timeout */ + val = 0; + else + val = order_base_2(ticks) - 13; + + dev_dbg(mmc_dev(host->mmc), "Set %s timeout %lu ticks @ %lu Hz\n", + mrq->data ? 
"data" : "cmd", ticks, host->rate); + + /* Timeout Counter mask: 0xf0 */ + usdhi6_write(host, USDHI6_SD_OPTION, (val << USDHI6_SD_OPTION_TIMEOUT_SHIFT) | + (usdhi6_read(host, USDHI6_SD_OPTION) & ~USDHI6_SD_OPTION_TIMEOUT_MASK)); +} + +static void usdhi6_request_done(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + struct mmc_data *data = mrq->data; + + if (WARN(host->pg.page || host->head_pg.page, + "Page %p or %p not unmapped: wait %u, CMD%d(%c) @ +0x%x %ux%u in SG%u!\n", + host->pg.page, host->head_pg.page, host->wait, mrq->cmd->opcode, + data ? (data->flags & MMC_DATA_READ ? 'R' : 'W') : '-', + data ? host->offset : 0, data ? data->blocks : 0, + data ? data->blksz : 0, data ? data->sg_len : 0)) + usdhi6_sg_unmap(host, true); + + if (mrq->cmd->error || + (data && data->error) || + (mrq->stop && mrq->stop->error)) + dev_dbg(mmc_dev(host->mmc), "%s(CMD%d: %ux%u): err %d %d %d\n", + __func__, mrq->cmd->opcode, data ? data->blocks : 0, + data ? data->blksz : 0, + mrq->cmd->error, + data ? data->error : 1, + mrq->stop ? mrq->stop->error : 1); + + /* Disable DMA */ + usdhi6_write(host, USDHI6_CC_EXT_MODE, 0); + host->wait = USDHI6_WAIT_FOR_REQUEST; + host->mrq = NULL; + + mmc_request_done(host->mmc, mrq); +} + +static int usdhi6_cmd_flags(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + struct mmc_command *cmd = mrq->cmd; + u16 opc = cmd->opcode; + + if (host->app_cmd) { + host->app_cmd = false; + opc |= USDHI6_SD_CMD_APP; + } + + if (mrq->data) { + opc |= USDHI6_SD_CMD_DATA; + + if (mrq->data->flags & MMC_DATA_READ) + opc |= USDHI6_SD_CMD_READ; + + if (cmd->opcode == MMC_READ_MULTIPLE_BLOCK || + cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK || + (cmd->opcode == SD_IO_RW_EXTENDED && + mrq->data->blocks > 1)) { + opc |= USDHI6_SD_CMD_MULTI; + if (!mrq->stop) + opc |= USDHI6_SD_CMD_CMD12_AUTO_OFF; + } + + switch (mmc_resp_type(cmd)) { + case MMC_RSP_NONE: + opc |= USDHI6_SD_CMD_MODE_RSP_NONE; + break; + case MMC_RSP_R1: + opc |= USDHI6_SD_CMD_MODE_RSP_R1; + break; + case MMC_RSP_R1B: + opc |= USDHI6_SD_CMD_MODE_RSP_R1B; + break; + case MMC_RSP_R2: + opc |= USDHI6_SD_CMD_MODE_RSP_R2; + break; + case MMC_RSP_R3: + opc |= USDHI6_SD_CMD_MODE_RSP_R3; + break; + default: + dev_warn(mmc_dev(host->mmc), + "Unknown response type %d\n", + mmc_resp_type(cmd)); + return -EINVAL; + } + } + + return opc; +} + +static int usdhi6_rq_start(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + struct mmc_command *cmd = mrq->cmd; + struct mmc_data *data = mrq->data; + int opc = usdhi6_cmd_flags(host); + int i; + + if (opc < 0) + return opc; + + for (i = 1000; i; i--) { + if (!(usdhi6_read(host, USDHI6_SD_INFO2) & USDHI6_SD_INFO2_CBSY)) + break; + usleep_range(10, 100); + } + + if (!i) { + dev_dbg(mmc_dev(host->mmc), "Command active, request aborted\n"); + return -EAGAIN; + } + + if (data) { + bool use_dma; + int ret = 0; + + host->page_idx = 0; + + if (cmd->opcode == SD_IO_RW_EXTENDED && data->blocks > 1) { + switch (data->blksz) { + case 512: + break; + case 32: + case 64: + case 128: + case 256: + if (mrq->stop) + ret = -EINVAL; + break; + default: + ret = -EINVAL; + } + } else if ((cmd->opcode == MMC_READ_MULTIPLE_BLOCK || + cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK) && + data->blksz != 512) { + ret = -EINVAL; + } + + if (ret < 0) { + dev_warn(mmc_dev(host->mmc), "%s(): %u blocks of %u bytes\n", + __func__, data->blocks, data->blksz); + return -EINVAL; + } + + if (cmd->opcode == MMC_READ_MULTIPLE_BLOCK || + cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK || + 
(cmd->opcode == SD_IO_RW_EXTENDED && + data->blocks > 1)) + usdhi6_sg_prep(host); + + usdhi6_write(host, USDHI6_SD_SIZE, data->blksz); + + if ((data->blksz >= USDHI6_MIN_DMA || + data->blocks > 1) && + (data->blksz % 4 || + data->sg->offset % 4)) + dev_dbg(mmc_dev(host->mmc), + "Bad SG of %u: %ux%u @ %u\n", data->sg_len, + data->blksz, data->blocks, data->sg->offset); + + /* Enable DMA for USDHI6_MIN_DMA bytes or more */ + use_dma = data->blksz >= USDHI6_MIN_DMA && + !(data->blksz % 4) && + usdhi6_dma_start(host) >= DMA_MIN_COOKIE; + + if (use_dma) + usdhi6_write(host, USDHI6_CC_EXT_MODE, USDHI6_CC_EXT_MODE_SDRW); + + dev_dbg(mmc_dev(host->mmc), + "%s(): request opcode %u, %u blocks of %u bytes in %u segments, %s %s @+0x%x%s\n", + __func__, cmd->opcode, data->blocks, data->blksz, + data->sg_len, use_dma ? "DMA" : "PIO", + data->flags & MMC_DATA_READ ? "read" : "write", + data->sg->offset, mrq->stop ? " + stop" : ""); + } else { + dev_dbg(mmc_dev(host->mmc), "%s(): request opcode %u\n", + __func__, cmd->opcode); + } + + /* We have to get a command completion interrupt with DMA too */ + usdhi6_wait_for_resp(host); + + host->wait = USDHI6_WAIT_FOR_CMD; + schedule_delayed_work(&host->timeout_work, host->timeout); + + /* SEC bit is required to enable block counting by the core */ + usdhi6_write(host, USDHI6_SD_STOP, + data && data->blocks > 1 ? USDHI6_SD_STOP_SEC : 0); + usdhi6_write(host, USDHI6_SD_ARG, cmd->arg); + + /* Kick command execution */ + usdhi6_write(host, USDHI6_SD_CMD, opc); + + return 0; +} + +static void usdhi6_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct usdhi6_host *host = mmc_priv(mmc); + int ret; + + cancel_delayed_work_sync(&host->timeout_work); + + host->mrq = mrq; + host->sg = NULL; + + usdhi6_timeout_set(host); + ret = usdhi6_rq_start(host); + if (ret < 0) { + mrq->cmd->error = ret; + usdhi6_request_done(host); + } +} + +static int usdhi6_get_cd(struct mmc_host *mmc) +{ + struct usdhi6_host *host = mmc_priv(mmc); + /* Read is atomic, no need to lock */ + u32 status = usdhi6_read(host, USDHI6_SD_INFO1) & USDHI6_SD_INFO1_CD; + +/* + * level status.CD CD_ACTIVE_HIGH card present + * 1 0 0 0 + * 1 0 1 1 + * 0 1 0 1 + * 0 1 1 0 + */ + return !status ^ !(mmc->caps2 & MMC_CAP2_CD_ACTIVE_HIGH); +} + +static int usdhi6_get_ro(struct mmc_host *mmc) +{ + struct usdhi6_host *host = mmc_priv(mmc); + /* No locking as above */ + u32 status = usdhi6_read(host, USDHI6_SD_INFO1) & USDHI6_SD_INFO1_WP; + +/* + * level status.WP RO_ACTIVE_HIGH card read-only + * 1 0 0 0 + * 1 0 1 1 + * 0 1 0 1 + * 0 1 1 0 + */ + return !status ^ !(mmc->caps2 & MMC_CAP2_RO_ACTIVE_HIGH); +} + +static void usdhi6_enable_sdio_irq(struct mmc_host *mmc, int enable) +{ + struct usdhi6_host *host = mmc_priv(mmc); + + dev_dbg(mmc_dev(mmc), "%s(): %sable\n", __func__, enable ? 
"en" : "dis"); + + if (enable) { + host->sdio_mask = USDHI6_SDIO_INFO1_IRQ & ~USDHI6_SDIO_INFO1_IOIRQ; + usdhi6_write(host, USDHI6_SDIO_INFO1_MASK, host->sdio_mask); + usdhi6_write(host, USDHI6_SDIO_MODE, 1); + } else { + usdhi6_write(host, USDHI6_SDIO_MODE, 0); + usdhi6_write(host, USDHI6_SDIO_INFO1_MASK, USDHI6_SDIO_INFO1_IRQ); + host->sdio_mask = USDHI6_SDIO_INFO1_IRQ; + } +} + +static struct mmc_host_ops usdhi6_ops = { + .request = usdhi6_request, + .set_ios = usdhi6_set_ios, + .get_cd = usdhi6_get_cd, + .get_ro = usdhi6_get_ro, + .enable_sdio_irq = usdhi6_enable_sdio_irq, +}; + +/* State machine handlers */ + +static void usdhi6_resp_cmd12(struct usdhi6_host *host) +{ + struct mmc_command *cmd = host->mrq->stop; + cmd->resp[0] = usdhi6_read(host, USDHI6_SD_RSP10); +} + +static void usdhi6_resp_read(struct usdhi6_host *host) +{ + struct mmc_command *cmd = host->mrq->cmd; + u32 *rsp = cmd->resp, tmp = 0; + int i; + +/* + * RSP10 39-8 + * RSP32 71-40 + * RSP54 103-72 + * RSP76 127-104 + * R2-type response: + * resp[0] = r[127..96] + * resp[1] = r[95..64] + * resp[2] = r[63..32] + * resp[3] = r[31..0] + * Other responses: + * resp[0] = r[39..8] + */ + + if (mmc_resp_type(cmd) == MMC_RSP_NONE) + return; + + if (!(host->irq_status & USDHI6_SD_INFO1_RSP_END)) { + dev_err(mmc_dev(host->mmc), + "CMD%d: response expected but is missing!\n", cmd->opcode); + return; + } + + if (mmc_resp_type(cmd) & MMC_RSP_136) + for (i = 0; i < 4; i++) { + if (i) + rsp[3 - i] = tmp >> 24; + tmp = usdhi6_read(host, USDHI6_SD_RSP10 + i * 8); + rsp[3 - i] |= tmp << 8; + } + else if (cmd->opcode == MMC_READ_MULTIPLE_BLOCK || + cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK) + /* Read RSP54 to avoid conflict with auto CMD12 */ + rsp[0] = usdhi6_read(host, USDHI6_SD_RSP54); + else + rsp[0] = usdhi6_read(host, USDHI6_SD_RSP10); + + dev_dbg(mmc_dev(host->mmc), "Response 0x%x\n", rsp[0]); +} + +static int usdhi6_blk_read(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + u32 *p; + int i, rest; + + if (host->io_error) { + data->error = usdhi6_error_code(host); + goto error; + } + + if (host->pg.page) { + p = host->blk_page + host->offset; + } else { + p = usdhi6_sg_map(host); + if (!p) { + data->error = -ENOMEM; + goto error; + } + } + + for (i = 0; i < data->blksz / 4; i++, p++) + *p = usdhi6_read(host, USDHI6_SD_BUF0); + + rest = data->blksz % 4; + for (i = 0; i < (rest + 1) / 2; i++) { + u16 d = usdhi6_read16(host, USDHI6_SD_BUF0); + ((u8 *)p)[2 * i] = ((u8 *)&d)[0]; + if (rest > 1 && !i) + ((u8 *)p)[2 * i + 1] = ((u8 *)&d)[1]; + } + + return 0; + +error: + dev_dbg(mmc_dev(host->mmc), "%s(): %d\n", __func__, data->error); + host->wait = USDHI6_WAIT_FOR_REQUEST; + return data->error; +} + +static int usdhi6_blk_write(struct usdhi6_host *host) +{ + struct mmc_data *data = host->mrq->data; + u32 *p; + int i, rest; + + if (host->io_error) { + data->error = usdhi6_error_code(host); + goto error; + } + + if (host->pg.page) { + p = host->blk_page + host->offset; + } else { + p = usdhi6_sg_map(host); + if (!p) { + data->error = -ENOMEM; + goto error; + } + } + + for (i = 0; i < data->blksz / 4; i++, p++) + usdhi6_write(host, USDHI6_SD_BUF0, *p); + + rest = data->blksz % 4; + for (i = 0; i < (rest + 1) / 2; i++) { + u16 d; + ((u8 *)&d)[0] = ((u8 *)p)[2 * i]; + if (rest > 1 && !i) + ((u8 *)&d)[1] = ((u8 *)p)[2 * i + 1]; + else + ((u8 *)&d)[1] = 0; + usdhi6_write16(host, USDHI6_SD_BUF0, d); + } + + return 0; + +error: + dev_dbg(mmc_dev(host->mmc), "%s(): %d\n", __func__, data->error); + host->wait = 
USDHI6_WAIT_FOR_REQUEST; + return data->error; +} + +static int usdhi6_stop_cmd(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + + switch (mrq->cmd->opcode) { + case MMC_READ_MULTIPLE_BLOCK: + case MMC_WRITE_MULTIPLE_BLOCK: + if (mrq->stop->opcode == MMC_STOP_TRANSMISSION) { + host->wait = USDHI6_WAIT_FOR_STOP; + return 0; + } + /* Unsupported STOP command */ + default: + dev_err(mmc_dev(host->mmc), + "unsupported stop CMD%d for CMD%d\n", + mrq->stop->opcode, mrq->cmd->opcode); + mrq->stop->error = -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static bool usdhi6_end_cmd(struct usdhi6_host *host) +{ + struct mmc_request *mrq = host->mrq; + struct mmc_command *cmd = mrq->cmd; + + if (host->io_error) { + cmd->error = usdhi6_error_code(host); + return false; + } + + usdhi6_resp_read(host); + + if (!mrq->data) + return false; + + if (host->dma_active) { + usdhi6_dma_kick(host); + if (!mrq->stop) + host->wait = USDHI6_WAIT_FOR_DMA; + else if (usdhi6_stop_cmd(host) < 0) + return false; + } else if (mrq->data->flags & MMC_DATA_READ) { + if (cmd->opcode == MMC_READ_MULTIPLE_BLOCK || + (cmd->opcode == SD_IO_RW_EXTENDED && + mrq->data->blocks > 1)) + host->wait = USDHI6_WAIT_FOR_MREAD; + else + host->wait = USDHI6_WAIT_FOR_READ; + } else { + if (cmd->opcode == MMC_WRITE_MULTIPLE_BLOCK || + (cmd->opcode == SD_IO_RW_EXTENDED && + mrq->data->blocks > 1)) + host->wait = USDHI6_WAIT_FOR_MWRITE; + else + host->wait = USDHI6_WAIT_FOR_WRITE; + } + + return true; +} + +static bool usdhi6_read_block(struct usdhi6_host *host) +{ + /* ACCESS_END IRQ is already unmasked */ + int ret = usdhi6_blk_read(host); + + /* + * Have to force unmapping both pages: the single block could have been + * cross-page, in which case for single-block IO host->page_idx == 0. + * So, if we don't force, the second page won't be unmapped. + */ + usdhi6_sg_unmap(host, true); + + if (ret < 0) + return false; + + host->wait = USDHI6_WAIT_FOR_DATA_END; + return true; +} + +static bool usdhi6_mread_block(struct usdhi6_host *host) +{ + int ret = usdhi6_blk_read(host); + + if (ret < 0) + return false; + + usdhi6_sg_advance(host); + + return !host->mrq->data->error && + (host->wait != USDHI6_WAIT_FOR_DATA_END || !host->mrq->stop); +} + +static bool usdhi6_write_block(struct usdhi6_host *host) +{ + int ret = usdhi6_blk_write(host); + + /* See comment in usdhi6_read_block() */ + usdhi6_sg_unmap(host, true); + + if (ret < 0) + return false; + + host->wait = USDHI6_WAIT_FOR_DATA_END; + return true; +} + +static bool usdhi6_mwrite_block(struct usdhi6_host *host) +{ + int ret = usdhi6_blk_write(host); + + if (ret < 0) + return false; + + usdhi6_sg_advance(host); + + return !host->mrq->data->error && + (host->wait != USDHI6_WAIT_FOR_DATA_END || !host->mrq->stop); +} + +/* Interrupt & timeout handlers */ + +static irqreturn_t usdhi6_sd_bh(int irq, void *dev_id) +{ + struct usdhi6_host *host = dev_id; + struct mmc_request *mrq; + struct mmc_command *cmd; + struct mmc_data *data; + bool io_wait = false; + + cancel_delayed_work_sync(&host->timeout_work); + + mrq = host->mrq; + if (!mrq) + return IRQ_HANDLED; + + cmd = mrq->cmd; + data = mrq->data; + + switch (host->wait) { + case USDHI6_WAIT_FOR_REQUEST: + /* We're too late, the timeout has already kicked in */ + return IRQ_HANDLED; + case USDHI6_WAIT_FOR_CMD: + /* Wait for data? */ + io_wait = usdhi6_end_cmd(host); + break; + case USDHI6_WAIT_FOR_MREAD: + /* Wait for more data? */ + io_wait = usdhi6_mread_block(host); + break; + case USDHI6_WAIT_FOR_READ: + /* Wait for data end? 
*/ + io_wait = usdhi6_read_block(host); + break; + case USDHI6_WAIT_FOR_MWRITE: + /* Wait data to write? */ + io_wait = usdhi6_mwrite_block(host); + break; + case USDHI6_WAIT_FOR_WRITE: + /* Wait for data end? */ + io_wait = usdhi6_write_block(host); + break; + case USDHI6_WAIT_FOR_DMA: + usdhi6_dma_check_error(host); + break; + case USDHI6_WAIT_FOR_STOP: + usdhi6_write(host, USDHI6_SD_STOP, 0); + if (host->io_error) { + int ret = usdhi6_error_code(host); + if (mrq->stop) + mrq->stop->error = ret; + else + mrq->data->error = ret; + dev_warn(mmc_dev(host->mmc), "%s(): %d\n", __func__, ret); + break; + } + usdhi6_resp_cmd12(host); + mrq->stop->error = 0; + break; + case USDHI6_WAIT_FOR_DATA_END: + if (host->io_error) { + mrq->data->error = usdhi6_error_code(host); + dev_warn(mmc_dev(host->mmc), "%s(): %d\n", __func__, + mrq->data->error); + } + break; + default: + cmd->error = -EFAULT; + dev_err(mmc_dev(host->mmc), "Invalid state %u\n", host->wait); + usdhi6_request_done(host); + return IRQ_HANDLED; + } + + if (io_wait) { + schedule_delayed_work(&host->timeout_work, host->timeout); + /* Wait for more data or ACCESS_END */ + if (!host->dma_active) + usdhi6_wait_for_brwe(host, mrq->data->flags & MMC_DATA_READ); + return IRQ_HANDLED; + } + + if (!cmd->error) { + if (data) { + if (!data->error) { + if (host->wait != USDHI6_WAIT_FOR_STOP && + host->mrq->stop && + !host->mrq->stop->error && + !usdhi6_stop_cmd(host)) { + /* Sending STOP */ + usdhi6_wait_for_resp(host); + + schedule_delayed_work(&host->timeout_work, + host->timeout); + + return IRQ_HANDLED; + } + + data->bytes_xfered = data->blocks * data->blksz; + } else { + /* Data error: might need to unmap the last page */ + dev_warn(mmc_dev(host->mmc), "%s(): data error %d\n", + __func__, data->error); + usdhi6_sg_unmap(host, true); + } + } else if (cmd->opcode == MMC_APP_CMD) { + host->app_cmd = true; + } + } + + usdhi6_request_done(host); + + return IRQ_HANDLED; +} + +static irqreturn_t usdhi6_sd(int irq, void *dev_id) +{ + struct usdhi6_host *host = dev_id; + u16 status, status2, error; + + status = usdhi6_read(host, USDHI6_SD_INFO1) & ~host->status_mask & + ~USDHI6_SD_INFO1_CARD; + status2 = usdhi6_read(host, USDHI6_SD_INFO2) & ~host->status2_mask; + + usdhi6_only_cd(host); + + dev_dbg(mmc_dev(host->mmc), + "IRQ status = 0x%08x, status2 = 0x%08x\n", status, status2); + + if (!status && !status2) + return IRQ_NONE; + + error = status2 & USDHI6_SD_INFO2_ERR; + + /* Ack / clear interrupts */ + if (USDHI6_SD_INFO1_IRQ & status) + usdhi6_write(host, USDHI6_SD_INFO1, + 0xffff & ~(USDHI6_SD_INFO1_IRQ & status)); + + if (USDHI6_SD_INFO2_IRQ & status2) { + if (error) + /* In error cases BWE and BRE aren't cleared automatically */ + status2 |= USDHI6_SD_INFO2_BWE | USDHI6_SD_INFO2_BRE; + + usdhi6_write(host, USDHI6_SD_INFO2, + 0xffff & ~(USDHI6_SD_INFO2_IRQ & status2)); + } + + host->io_error = error; + host->irq_status = status; + + if (error) { + /* Don't pollute the log with unsupported command timeouts */ + if (host->wait != USDHI6_WAIT_FOR_CMD || + error != USDHI6_SD_INFO2_RSP_TOUT) + dev_warn(mmc_dev(host->mmc), + "%s(): INFO2 error bits 0x%08x\n", + __func__, error); + else + dev_dbg(mmc_dev(host->mmc), + "%s(): INFO2 error bits 0x%08x\n", + __func__, error); + } + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t usdhi6_sdio(int irq, void *dev_id) +{ + struct usdhi6_host *host = dev_id; + u32 status = usdhi6_read(host, USDHI6_SDIO_INFO1) & ~host->sdio_mask; + + dev_dbg(mmc_dev(host->mmc), "%s(): status 0x%x\n", __func__, status); + + if 
(!status) + return IRQ_NONE; + + usdhi6_write(host, USDHI6_SDIO_INFO1, ~status); + + mmc_signal_sdio_irq(host->mmc); + + return IRQ_HANDLED; +} + +static irqreturn_t usdhi6_cd(int irq, void *dev_id) +{ + struct usdhi6_host *host = dev_id; + struct mmc_host *mmc = host->mmc; + u16 status; + + /* We're only interested in hotplug events here */ + status = usdhi6_read(host, USDHI6_SD_INFO1) & ~host->status_mask & + USDHI6_SD_INFO1_CARD; + + if (!status) + return IRQ_NONE; + + /* Ack */ + usdhi6_write(host, USDHI6_SD_INFO1, !status); + + if (!work_pending(&mmc->detect.work) && + (((status & USDHI6_SD_INFO1_CARD_INSERT) && + !mmc->card) || + ((status & USDHI6_SD_INFO1_CARD_EJECT) && + mmc->card))) + mmc_detect_change(mmc, msecs_to_jiffies(100)); + + return IRQ_HANDLED; +} + +/* + * Actually this should not be needed if the built-in timeout works reliably in + * both PIO cases and DMA never fails. But if DMA does fail, a timeout + * handler might be the only way to catch the error. + */ +static void usdhi6_timeout_work(struct work_struct *work) +{ + struct delayed_work *d = container_of(work, struct delayed_work, work); + struct usdhi6_host *host = container_of(d, struct usdhi6_host, timeout_work); + struct mmc_request *mrq = host->mrq; + struct mmc_data *data = mrq ? mrq->data : NULL; + + dev_warn(mmc_dev(host->mmc), + "%s timeout wait %u CMD%d: IRQ 0x%08x:0x%08x, last IRQ 0x%08x\n", + host->dma_active ? "DMA" : "PIO", + host->wait, mrq ? mrq->cmd->opcode : -1, + usdhi6_read(host, USDHI6_SD_INFO1), + usdhi6_read(host, USDHI6_SD_INFO2), host->irq_status); + + if (host->dma_active) { + usdhi6_dma_kill(host); + usdhi6_dma_stop_unmap(host); + } + + switch (host->wait) { + default: + dev_err(mmc_dev(host->mmc), "Invalid state %u\n", host->wait); + /* mrq can be NULL in this actually impossible case */ + case USDHI6_WAIT_FOR_CMD: + usdhi6_error_code(host); + if (mrq) + mrq->cmd->error = -ETIMEDOUT; + break; + case USDHI6_WAIT_FOR_STOP: + usdhi6_error_code(host); + mrq->stop->error = -ETIMEDOUT; + break; + case USDHI6_WAIT_FOR_DMA: + case USDHI6_WAIT_FOR_MREAD: + case USDHI6_WAIT_FOR_MWRITE: + case USDHI6_WAIT_FOR_READ: + case USDHI6_WAIT_FOR_WRITE: + dev_dbg(mmc_dev(host->mmc), + "%c: page #%u @ +0x%x %ux%u in SG%u. Current SG %u bytes @ %u\n", + data->flags & MMC_DATA_READ ? 
'R' : 'W', host->page_idx, + host->offset, data->blocks, data->blksz, data->sg_len, + sg_dma_len(host->sg), host->sg->offset); + usdhi6_sg_unmap(host, true); + /* + * If USDHI6_WAIT_FOR_DATA_END times out, we have already unmapped + * the page + */ + case USDHI6_WAIT_FOR_DATA_END: + usdhi6_error_code(host); + data->error = -ETIMEDOUT; + } + + if (mrq) + usdhi6_request_done(host); +} + +/* Probe / release */ + +static const struct of_device_id usdhi6_of_match[] = { + {.compatible = "renesas,usdhi6rol0"}, + {} +}; +MODULE_DEVICE_TABLE(of, usdhi6_of_match); + +static int usdhi6_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct mmc_host *mmc; + struct usdhi6_host *host; + struct resource *res; + int irq_cd, irq_sd, irq_sdio; + u32 version; + int ret; + + if (!dev->of_node) + return -ENODEV; + + irq_cd = platform_get_irq_byname(pdev, "card detect"); + irq_sd = platform_get_irq_byname(pdev, "data"); + irq_sdio = platform_get_irq_byname(pdev, "SDIO"); + if (irq_sd < 0 || irq_sdio < 0) + return -ENODEV; + + mmc = mmc_alloc_host(sizeof(struct usdhi6_host), dev); + if (!mmc) + return -ENOMEM; + + ret = mmc_of_parse(mmc); + if (ret < 0) + goto e_free_mmc; + + mmc_regulator_get_supply(mmc); + + host = mmc_priv(mmc); + host->mmc = mmc; + host->wait = USDHI6_WAIT_FOR_REQUEST; + host->timeout = msecs_to_jiffies(4000); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + host->base = devm_ioremap_resource(dev, res); + if (IS_ERR(host->base)) { + ret = PTR_ERR(host->base); + goto e_free_mmc; + } + + host->clk = devm_clk_get(dev, NULL); + if (IS_ERR(host->clk)) + goto e_free_mmc; + + host->imclk = clk_get_rate(host->clk); + + ret = clk_prepare_enable(host->clk); + if (ret < 0) + goto e_free_mmc; + + version = usdhi6_read(host, USDHI6_VERSION); + if ((version & 0xfff) != 0xa0d) { + dev_err(dev, "Version not recognized %x\n", version); + goto e_clk_off; + } + + dev_info(dev, "A USDHI6ROL0 SD host detected with %d ports\n", + usdhi6_read(host, USDHI6_SD_PORT_SEL) >> USDHI6_SD_PORT_SEL_PORTS_SHIFT); + + usdhi6_mask_all(host); + + if (irq_cd >= 0) { + ret = devm_request_irq(dev, irq_cd, usdhi6_cd, 0, + dev_name(dev), host); + if (ret < 0) + goto e_clk_off; + } else { + mmc->caps |= MMC_CAP_NEEDS_POLL; + } + + ret = devm_request_threaded_irq(dev, irq_sd, usdhi6_sd, usdhi6_sd_bh, 0, + dev_name(dev), host); + if (ret < 0) + goto e_clk_off; + + ret = devm_request_irq(dev, irq_sdio, usdhi6_sdio, 0, + dev_name(dev), host); + if (ret < 0) + goto e_clk_off; + + INIT_DELAYED_WORK(&host->timeout_work, usdhi6_timeout_work); + + usdhi6_dma_request(host, res->start); + + mmc->ops = &usdhi6_ops; + mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED | + MMC_CAP_UHS_SDR50 | MMC_CAP_UHS_DDR50 | MMC_CAP_SDIO_IRQ; + /* Set .max_segs to some random number. Feel free to adjust. */ + mmc->max_segs = 32; + mmc->max_blk_size = 512; + mmc->max_req_size = PAGE_CACHE_SIZE * mmc->max_segs; + mmc->max_blk_count = mmc->max_req_size / mmc->max_blk_size; + /* + * Setting .max_seg_size to 1 page would simplify our page-mapping code, + * But OTOH, having large segments makes DMA more efficient. We could + * check, whether we managed to get DMA and fall back to 1 page + * segments, but if we do manage to obtain DMA and then it fails at + * run-time and we fall back to PIO, we will continue getting large + * segments. So, we wouldn't be able to get rid of the code anyway. 
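+ *
+ * [Editor's note, not part of the original patch: illustrative arithmetic
+ * for the limits chosen above, assuming PAGE_CACHE_SIZE == 4096.
+ * max_req_size = 4096 * 32 = 131072 bytes, so max_blk_count =
+ * 131072 / 512 = 256 blocks per request; and f_min = imclk / 512 below
+ * matches the largest clock divider SD_CLK_CTRL can encode (dividers
+ * are powers of two and div >> 2 must fit into the 0xff mask, so div
+ * tops out at 512).]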
+ */ + mmc->max_seg_size = mmc->max_req_size; + if (!mmc->f_max) + mmc->f_max = host->imclk; + mmc->f_min = host->imclk / 512; + + platform_set_drvdata(pdev, host); + + ret = mmc_add_host(mmc); + if (ret < 0) + goto e_clk_off; + + return 0; + +e_clk_off: + clk_disable_unprepare(host->clk); +e_free_mmc: + mmc_free_host(mmc); + + return ret; +} + +static int usdhi6_remove(struct platform_device *pdev) +{ + struct usdhi6_host *host = platform_get_drvdata(pdev); + + mmc_remove_host(host->mmc); + + usdhi6_mask_all(host); + cancel_delayed_work_sync(&host->timeout_work); + usdhi6_dma_release(host); + clk_disable_unprepare(host->clk); + mmc_free_host(host->mmc); + + return 0; +} + +static struct platform_driver usdhi6_driver = { + .probe = usdhi6_probe, + .remove = usdhi6_remove, + .driver = { + .name = "usdhi6rol0", + .owner = THIS_MODULE, + .of_match_table = usdhi6_of_match, + }, +}; + +module_platform_driver(usdhi6_driver); + +MODULE_DESCRIPTION("Renesas usdhi6rol0 SD/SDIO host driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:usdhi6rol0"); +MODULE_AUTHOR("Guennadi Liakhovetski <g.liakhovetski@gmx.de>"); diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 498d1f799085..282891a8e451 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -840,7 +840,7 @@ static int wmt_mci_probe(struct platform_device *pdev) priv->dma_desc_buffer = dma_alloc_coherent(&pdev->dev, mmc->max_blk_count * 16, &priv->dma_desc_device_addr, - 208); + GFP_KERNEL); if (!priv->dma_desc_buffer) { dev_err(&pdev->dev, "DMA alloc fail\n"); ret = -EPERM; diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c index 2e39d38d6a9e..46e7af446f01 100644 --- a/drivers/net/wireless/rsi/rsi_91x_sdio.c +++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c @@ -285,7 +285,6 @@ static void rsi_reset_card(struct sdio_func *pfunction) if (err) { rsi_dbg(ERR_ZONE, "%s: CCCR speed reg read failed: %d\n", __func__, err); - card->state &= ~MMC_STATE_HIGHSPEED; } else { err = rsi_cmd52writebyte(card, SDIO_CCCR_SPEED, @@ -296,14 +295,13 @@ static void rsi_reset_card(struct sdio_func *pfunction) __func__, err); return; } - mmc_card_set_highspeed(card); host->ios.timing = MMC_TIMING_SD_HS; host->ops->set_ios(host, &host->ios); } } /* Set clock */ - if (mmc_card_highspeed(card)) + if (mmc_card_hs(card)) clock = 50000000; else clock = card->cis.max_dtr; diff --git a/fs/attr.c b/fs/attr.c index 5d4e59d56e85..6530ced19697 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? 
attr->ia_gid : inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -160,7 +160,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) umode_t mode = attr->ia_mode; if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/inode.c b/fs/inode.c index 2feb9b69f1be..6eecb7ff0b9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 00ec0b9c94d1..d3e40db28930 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -14,6 +14,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs3.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 9a55797a1cd4..3e9f7874b975 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,6 +15,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index de051cb1f553..8f27c93f8d2e 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -622,8 +622,8 @@ static int __init init_nlm(void) err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); -#endif err_sysctl: +#endif return err; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dc5c75930f0f..b6f3b84b6e99 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -14,12 +14,11 @@ #include <linux/mutex.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/addr.h> -#include <linux/nfsd/nfsfh.h> -#include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> #include <linux/module.h> #include <linux/mount.h> +#include <uapi/linux/nfs2.h> #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 964666c68a86..9340e7e10ef6 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,6 +16,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR diff --git a/fs/namei.c b/fs/namei.c index 80168273396b..985c6f368485 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -332,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -345,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at 
least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -353,7 +354,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -2379,7 +2380,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 03192a66c143..4782e0840dcc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o -obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o - +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 65d849bdf77a..9b431f44fad9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -210,7 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err) SetPageUptodate(bvec->bv_page); if (err) { - struct nfs_read_data *rdata = par->data; + struct nfs_pgio_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; if (!header->pnfs_error) @@ -224,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err) static void bl_read_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } static void bl_end_par_io_read(void *data, int unused) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rdata->task.tk_status = rdata->header->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); @@ -242,7 +242,7 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +bl_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *header = rdata->header; int i, hole; @@ -390,7 +390,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) } if (unlikely(err)) { - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!header->pnfs_error) @@ -405,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!uptodate) { @@ -423,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err) static void bl_write_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, 
u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); if (likely(!wdata->header->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), @@ -438,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work) /* Called when last of bios associated with a bl_write_pagelist call finishes */ static void bl_end_par_io_write(void *data, int num_se) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(wdata->header->pnfs_error)) { bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, @@ -673,7 +673,7 @@ check_page: } static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) { struct nfs_pgio_header *header = wdata->header; int i, ret, npg_zero, pg_index, last = 0; @@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) pnfs_generic_pg_init_read(pgio, req); } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, SECTOR_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } @@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, PAGE_CACHE_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b8797ae6831f..4ad7bc388679 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int ds_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + WARN_ON_ONCE(verfp->committed >= 0); + 
memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. + * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + WARN_ON_ONCE(verfp->committed < 0); + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif + /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -380,8 +472,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? 
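The helpers above implement a set-or-compare scheme for NFS write verifiers: nfs_direct_req_alloc() marks dreq->verf unset with NFS_INVALID_STABLE_HOW, the first completed WRITE records the verifier the (MDS or DS) server returned, and every later WRITE or COMMIT routed to the same server must match it, otherwise the server may have rebooted and the direct request is rescheduled. A minimal userspace model of that protocol, with illustrative names rather than the kernel's:

#include <stdio.h>
#include <string.h>

#define INVALID_STABLE_HOW  -1

struct writeverf {
        int committed;                  /* INVALID_STABLE_HOW = unset */
        unsigned char data[8];          /* opaque boot verifier */
};

/* Record the verifier on first use; afterwards return nonzero on mismatch. */
static int set_or_cmp_verf(struct writeverf *seen,
                           const struct writeverf *fresh)
{
        if (seen->committed == INVALID_STABLE_HOW) {
                *seen = *fresh;
                return 0;
        }
        return memcmp(seen->data, fresh->data, sizeof(seen->data));
}

int main(void)
{
        struct writeverf seen = { .committed = INVALID_STABLE_HOW };
        struct writeverf a = { 1, "boot-1" }, b = { 1, "boot-2" };

        printf("first reply: %d\n", set_or_cmp_verf(&seen, &a));  /* 0 */
        printf("same boot:   %d\n", set_or_cmp_verf(&seen, &a));  /* 0 */
        printf("rebooted:    %d  -> reschedule writes\n",
               set_or_cmp_verf(&seen, &b));                       /* nonzero */
        return 0;
}

The per-DS twist lives in nfs_direct_select_verf(): with pNFS each commit bucket keeps its own direct_verf, so a reboot of one data server does not invalidate writes that went to another.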
*/ - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -424,7 +515,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; unsigned long seg; - NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, + nfs_pageio_init_read(&desc, dreq->inode, false, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; @@ -564,7 +655,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->count = 0; get_dreq(dreq); - NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; @@ -603,7 +694,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dprintk("NFS: %5u commit failed with error %d.\n", data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } @@ -750,8 +841,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -813,13 +903,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, hdr->verf, - sizeof(dreq->verf)); + nfs_direct_set_hdr_verf(dreq, hdr); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else bit = NFS_IOHDR_NEED_COMMIT; @@ -829,6 +919,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { + bool do_destroy = true; + req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); switch (bit) { @@ -836,6 +928,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) case NFS_IOHDR_NEED_COMMIT: kref_get(&req->wb_kref); nfs_mark_request_commit(req, hdr->lseg, &cinfo); + do_destroy = false; } nfs_unlock_and_release_request(req); } @@ -874,7 +967,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; unsigned long seg; - NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 000000000000..8516cdffb9e9 --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o 
filelayoutdev.o diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c index b9a35c05b60f..d2eba1c13b7e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -35,11 +35,11 @@ #include <linux/sunrpc/metrics.h> -#include "nfs4session.h" -#include "internal.h" -#include "delegation.h" -#include "nfs4filelayout.h" -#include "nfs4trace.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -84,7 +84,7 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -static void filelayout_reset_write(struct nfs_write_data *data) +static void filelayout_reset_write(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -105,7 +105,7 @@ static void filelayout_reset_write(struct nfs_write_data *data) } } -static void filelayout_reset_read(struct nfs_read_data *data) +static void filelayout_reset_read(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -243,7 +243,7 @@ wait_on_recovery: /* NFS_PROTO call done callback routines */ static int filelayout_read_done_cb(struct rpc_task *task, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, * rfc5661 is not clear about which credential should be used. */ static void -filelayout_set_layoutcommit(struct nfs_write_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; @@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) return; pnfs_set_layoutcommit(wdata); - dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -317,7 +317,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - rdata->read_done_cb = filelayout_read_done_cb; + rdata->pgio_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, @@ -331,7 +331,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); @@ -347,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -363,7 +363,7 
@@ static void filelayout_read_release(void *data) } static int filelayout_write_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -419,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -443,7 +443,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && task->tk_status == 0) { @@ -457,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -529,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = { }; static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_read_data *data) +filelayout_read_pagelist(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -560,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; @@ -568,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_read(ds_clnt, data, - &filelayout_read_call_ops, RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, data, + &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } /* Perform async writes. */ static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_write_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -600,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); - data->write_done_cb = filelayout_write_done_cb; + data->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; - /* - * Get the file offset on the dserver. Set the write offset to - * this offset and save the original offset. 
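filelayout_get_dserver_offset(), called just below this hunk, maps a file offset to the offset actually used on the data server: STRIPE_SPARSE layouts keep the file offset unchanged, while STRIPE_DENSE layouts pack only the stripe units belonging to this DS back to back. A hedged model of the dense remapping (illustrative only, and ignoring the layout's pattern offset):

#include <stdint.h>
#include <stdio.h>

/* Dense remap: file stripe unit n is the (n / stripe_count)-th unit
 * stored on its data server; the offset inside the unit is preserved. */
static uint64_t dense_ds_offset(uint64_t offset, uint32_t stripe_unit,
                                uint32_t stripe_count)
{
        uint64_t n = offset / stripe_unit;      /* stripe number in the file */
        uint64_t r = offset % stripe_unit;      /* offset within the unit */

        return (n / stripe_count) * stripe_unit + r;
}

int main(void)
{
        /* 64 KiB units over 4 data servers: file offset 256 KiB is file
         * stripe 4, i.e. the second unit held by DS 0, so DS offset 64 KiB. */
        printf("%llu\n",
               (unsigned long long)dense_ds_offset(262144, 65536, 4));
        return 0;
}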
- */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_write(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; @@ -637,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_deviceid_node *d; struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; - struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); dprintk("--> %s\n", __func__); @@ -655,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + if (!fl->stripe_unit) { dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; @@ -692,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out_put; } - if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { - dprintk("%s Stripe unit (%u) not aligned with rsize %u " - "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, - nfss->wsize); - } - status = 0; out: dprintk("--> %s returns %d\n", __func__, status); @@ -850,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); struct pnfs_commit_bucket *buckets; - int size; + int size, i; if (fl->commit_through_mds) return 0; - if (cinfo->ds->nbuckets != 0) { + + size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + if (cinfo->ds->nbuckets >= size) { /* This assumes there is only one IOMODE_RW lseg. What * we really want to do is have a layout_hdr level * dictionary of <multipath_list4, fh> keys, each @@ -864,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, return 0; } - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), gfp_flags); if (!buckets) return -ENOMEM; - else { - int i; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + } - spin_lock(cinfo->lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - } - } - spin_unlock(cinfo->lock); - return 0; + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets >= size) + goto out; + for (i = 0; i < cinfo->ds->nbuckets; i++) { + list_splice(&cinfo->ds->buckets[i].written, + &buckets[i].written); + list_splice(&cinfo->ds->buckets[i].committing, + &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; + buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; + buckets[i].clseg = cinfo->ds->buckets[i].clseg; } + swap(cinfo->ds->buckets, buckets); + cinfo->ds->nbuckets = size; +out: + spin_unlock(cinfo->lock); + kfree(buckets); + return 0; } static struct pnfs_layout_segment * @@ -915,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return true : coalesce page - * return false : don't coalesce page + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
*/ -static bool +static size_t filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { + unsigned int size; u64 p_stripe, r_stripe; - u32 stripe_unit; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req) || - !nfs_generic_pg_test(pgio, prev, req)) - return false; + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; - p_stripe = (u64)req_offset(prev); - r_stripe = (u64)req_offset(req); - stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); - do_div(p_stripe, stripe_unit); - do_div(r_stripe, stripe_unit); + if (p_stripe != r_stripe) + return 0; + } - return (p_stripe == r_stripe); + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); } static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) { - /* - * Handling unaligned pages is difficult, because have to - * somehow split a req in two in certain cases in the - * pg.test code. Avoid this by just not using pnfs - * in this case. - */ - nfs_pageio_reset_read_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -973,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) - goto out_mds; - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -1067,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req, */ j = nfs4_fl_calc_j_index(lseg, req_offset(req)); i = select_bucket_index(fl, j); + spin_lock(cinfo->lock); buckets = cinfo->ds->buckets; list = &buckets[i].written; if (list_empty(list)) { @@ -1080,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); return list; } @@ -1176,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst, return ret; } +/* Note called with cinfo->lock held. 
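filelayout_alloc_commit_info() above now grows the commit-bucket array instead of bailing out when one already exists: the larger array is allocated without cinfo->lock held, the live entries are migrated under the lock, and the swap() means the single kfree() after unlocking frees whichever array ended up unused, including the new one if another thread won the race. A simplified userspace sketch of that grow-and-swap pattern, with stand-in types and a pthread mutex in place of the spinlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bucket { int nwritten; };        /* stand-in for pnfs_commit_bucket */

struct commit_info {
        pthread_mutex_t lock;
        struct bucket *buckets;
        size_t nbuckets;
};

static int grow_buckets(struct commit_info *ci, size_t size)
{
        struct bucket *fresh, *old;

        if (ci->nbuckets >= size)
                return 0;               /* already big enough */

        fresh = calloc(size, sizeof(*fresh));   /* allocate outside the lock */
        if (!fresh)
                return -1;

        pthread_mutex_lock(&ci->lock);
        if (ci->nbuckets < size) {      /* re-check: we raced unlocked */
                if (ci->nbuckets)
                        memcpy(fresh, ci->buckets,
                               ci->nbuckets * sizeof(*fresh));
                old = ci->buckets;
                ci->buckets = fresh;
                ci->nbuckets = size;
                fresh = old;            /* swap: free the loser below */
        }
        pthread_mutex_unlock(&ci->lock);
        free(fresh);                    /* old array, or ours if we lost */
        return 0;
}

int main(void)
{
        struct commit_info ci = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };

        grow_buckets(&ci, 4);
        grow_buckets(&ci, 8);           /* second call preserves entries */
        printf("%zu buckets\n", ci.nbuckets);   /* 8 */
        return 0;
}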
*/ static int filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, @@ -1220,15 +1226,18 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; int i; +restart: spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - spin_unlock(cinfo->lock); - pnfs_put_lseg(b->wlseg); + freeme = b->wlseg; b->wlseg = NULL; - spin_lock(cinfo->lock); + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + goto restart; } } cinfo->ds->nwritten = 0; @@ -1243,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) struct nfs_commit_data *data; int i, j; unsigned int nreq = 0; + struct pnfs_layout_segment *freeme; fl_cinfo = cinfo->ds; bucket = fl_cinfo->buckets; @@ -1253,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (!data) break; data->ds_commit_index = i; + spin_lock(cinfo->lock); data->lseg = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -1264,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - pnfs_put_lseg(bucket->clseg); + spin_lock(cinfo->lock); + freeme = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); } /* Caller will clean up entries put on list */ return nreq; @@ -1330,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return &flo->generic_hdr; + return flo != NULL ? &flo->generic_hdr : NULL; } static void diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h index cebd20e7e923..ffbddf2219ea 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -30,7 +30,7 @@ #ifndef FS_NFS_NFS4FILELAYOUT_H #define FS_NFS_NFS4FILELAYOUT_H -#include "pnfs.h" +#include "../pnfs.h" /* * Default data server connection timeout and retrans vaules. 
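The rewritten filelayout_pg_test() above also changes meaning, not just type: instead of a yes/no answer it returns how many bytes of @req may be coalesced, clamped so that no merged request ever crosses a stripe-unit boundary. The clamp itself is simple modular arithmetic, extracted here into a runnable sketch that assumes a 64 KiB stripe unit:

#include <stdint.h>
#include <stdio.h>

/* Bytes from req_offset to the end of its stripe unit; coalescing must
 * stop there because the next unit lives on a different data server. */
static uint32_t bytes_left_in_stripe(uint64_t req_offset,
                                     uint64_t segment_offset,
                                     uint32_t stripe_unit)
{
        uint32_t stripe_offset = (req_offset - segment_offset) % stripe_unit;

        return stripe_unit - stripe_offset;
}

int main(void)
{
        uint32_t su = 65536;

        /* 4 KiB short of a boundary: only 4 KiB may still coalesce */
        printf("%u\n", bytes_left_in_stripe(su - 4096, 0, su)); /* 4096 */
        /* exactly on a boundary: a whole stripe unit fits */
        printf("%u\n", bytes_left_in_stripe(2 * su, 0, su));    /* 65536 */
        return 0;
}

The kernel version additionally takes the minimum of this clamp and whatever pnfs_generic_pg_test() (which in turn calls nfs_generic_pg_test()) allows.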
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index b9c61efe9660..44bf0140a4c7 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -33,9 +33,9 @@ #include <linux/module.h> #include <linux/sunrpc/addr.h> -#include "internal.h" -#include "nfs4session.h" -#include "nfs4filelayout.h" +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 66984a9aafaa..b94f80420a58 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, security_d_instantiate(ret, inode); spin_lock(&ret->d_lock); - if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + if (IS_ROOT(ret) && !ret->d_fsdata && + !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { ret->d_fsdata = name; name = NULL; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e6f7398d2b3c..c496f8a74639 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1575,18 +1575,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) - invalid |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ @@ -1608,7 +1610,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)new_isize); } } else - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); @@ -1616,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); else if (server->caps & NFS_CAP_ATIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { @@ -1627,7 +1631,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } } else if (server->caps & NFS_CAP_MODE) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1638,7 +1643,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1649,7 +1655,8 @@ static int 
nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1662,7 +1669,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dd8bfc2e2464..8b69cba1bb04 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -231,13 +231,20 @@ extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_io_counter *c); +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, + const struct rpc_call_ops *, int, int); + static inline void nfs_iocounter_init(struct nfs_io_counter *c) { c->flags = 0; @@ -395,19 +402,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ -extern struct nfs_read_header *nfs_readhdr_alloc(void); -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_readdata_release(struct nfs_read_data *rdata); /* super.c */ void nfs_clone_super(struct super_block *, struct nfs_mount_info *); @@ -422,19 +421,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_write_header *nfs_writehdr_alloc(void); -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_commit_data 
*p); -extern int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, @@ -447,6 +437,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, @@ -492,7 +483,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 62db136339ea..5f61b83f4a1c 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* * typedef opaque nfsdata<>; */ -static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) { u32 recvd, count; __be32 *p; @@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, * }; */ static void encode_readargs(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_readargs(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, * }; */ static void encode_writeargs(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_writeargs(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -857,7 +857,7 @@ out_default: * }; */ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -878,7 +878,7 @@ out_default: } static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index db60149c4579..e7daa42bbc86 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -795,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct 
nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -807,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -829,17 +829,11 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { rpc_call_start(task); @@ -946,13 +940,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .fsinfo = nfs3_proc_fsinfo, .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, .read_setup = nfs3_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fa6d72131c19..8f4cbe7f4aa8 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, * }; */ static void encode_read3args(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, * }; */ static void encode_write3args(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_write3args(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -1589,7 +1589,7 @@ out_default: * }; */ static int decode_read3resok(struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { u32 eof, count, ocount, recvd; __be32 *p; @@ -1625,7 +1625,7 @@ 
out_overflow: } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -1673,7 +1673,7 @@ out_status: * }; */ static int decode_write3resok(struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { __be32 *p; @@ -1697,7 +1697,7 @@ out_eio: } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e1d1badbe53c..f63cb87cd730 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -337,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, */ static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) @@ -369,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8de3407e0360..464db9dd6318 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -100,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) break; mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret && !datasync) - /* application has asked for meta-data sync */ + if (!ret) ret = pnfs_layoutcommit_inode(inode, true); mutex_unlock(&inode->i_mutex); /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 397be39c6dc8..285ad5334018 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2027,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } @@ -2750,7 +2750,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4033,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_read_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_data *data) { nfs_invalidate_atime(data->header->inode); } -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4055,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) } static bool nfs4_read_stateid_changed(struct rpc_task *task, - struct nfs_readargs *args) + struct nfs_pgio_args 
*args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4068,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { dprintk("--> %s\n", __func__); @@ -4077,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return -EAGAIN; if (nfs4_read_stateid_changed(task, &data->args)) return -EAGAIN; - return data->read_done_cb ? data->read_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_read_done_cb(task, data); } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { data->timestamp = jiffies; - data->read_done_cb = nfs4_read_done_cb; + data->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, @@ -4097,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat task)) return 0; if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_READ) == -EIO) + data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) return -EIO; return 0; } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -4121,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data } static bool nfs4_write_stateid_changed(struct rpc_task *task, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4134,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; if (nfs4_write_stateid_changed(task, &data->args)) return -EAGAIN; - return data->write_done_cb ? data->write_done_cb(task, data) : + return data->pgio_done_cb ? 
data->pgio_done_cb(task, data) : nfs4_write_done_cb(task, data); } static -bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) { const struct nfs_pgio_header *hdr = data->header; @@ -4158,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4168,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag } else data->args.bitmask = server->cache_consistency_bitmask; - if (!data->write_done_cb) - data->write_done_cb = nfs4_write_done_cb; + if (!data->pgio_done_cb) + data->pgio_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -4177,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return 0; - if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_WRITE) == -EIO) - return -EIO; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - return -EIO; - return 0; -} - static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { nfs4_setup_sequence(NFS_SERVER(data->inode), @@ -8432,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, .read_setup = nfs4_proc_read_setup, - .read_pageio_init = pnfs_pageio_init_read, - .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, - .write_pageio_init = pnfs_pageio_init_write, - .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c0583b9bef71..848f6853c59e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1456,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * server that doesn't support a grace period. 
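The hunk below switches nfs4_reclaim_open_state() to the raw_ seqcount variants, presumably to sidestep the annotations of the plain ones: the write section here spans code that drops and retakes so_lock and may restart. For reference, a minimal single-threaded model of the seqcount protocol itself (a real implementation also needs memory barriers):

#include <stdio.h>

static unsigned int seq;                /* odd while a writer is active */
static int shared_a, shared_b;

static void write_begin(void) { seq++; }
static void write_end(void)   { seq++; }

/* Readers retry until they see an even, unchanged sequence number. */
static void read_consistent(int *a, int *b)
{
        unsigned int start;

        do {
                while ((start = seq) & 1)
                        ;               /* writer in progress: wait */
                *a = shared_a;
                *b = shared_b;
        } while (seq != start);
}

int main(void)
{
        int a, b;

        write_begin();
        shared_a = 1;
        shared_b = 2;
        write_end();

        read_consistent(&a, &b);
        printf("%d %d (seq=%u)\n", a, b, seq);  /* 1 2 (seq=2) */
        return 0;
}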
*/ spin_lock(&sp->so_lock); - write_seqcount_begin(&sp->so_reclaim_seqcount); + raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1519,13 +1519,13 @@ restart: spin_lock(&sp->so_lock); goto restart; } - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 849cf146db30..0a744f3a86f6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( - const struct nfs_read_data *data, + const struct nfs_pgio_data *data, int error ), @@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, #define DEFINE_NFS4_READ_EVENT(name) \ DEFINE_EVENT(nfs4_read_event, name, \ TP_PROTO( \ - const struct nfs_read_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) @@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); DECLARE_EVENT_CLASS(nfs4_write_event, TP_PROTO( - const struct nfs_write_data *data, + const struct nfs_pgio_data *data, int error ), @@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, #define DEFINE_NFS4_WRITE_EVENT(name) \ DEFINE_EVENT(nfs4_write_event, name, \ TP_PROTO( \ - const struct nfs_write_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 73ce8d4fe2c8..939ae606cfa4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1556,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -1701,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4 encode_nfs4_verifier(xdr, &arg->confirm); } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -2451,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a READ request */ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2513,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a WRITE request */ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -5085,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_PUTROOTFH); } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct 
nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) { __be32 *p; uint32_t count, eof, recvd; @@ -5339,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) { __be32 *p; int status; @@ -6636,7 +6639,7 @@ out: * Decode Read response */ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_readres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; @@ -6661,7 +6664,7 @@ out: * Decode WRITE response */ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_writeres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5457745dd4f1..611320753db2 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -int objio_read_pagelist(struct nfs_read_data *rdata) +int objio_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; @@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private) static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; - struct nfs_write_data *wdata = objios->oir.rpcdata; + struct nfs_pgio_data *wdata = objios->oir.rpcdata; struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; struct page *page; @@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = { .put_page = &__r4w_put_page, }; -int objio_write_pagelist(struct nfs_write_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; @@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return 0; } -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
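The comment above states the new pg_test contract introduced by this series: instead of a bool, pg_test now reports how many bytes of @req may be coalesced, with 0 meaning none. A minimal userspace sketch of a caller honouring that contract (the structures are simplified stand-ins, not the kernel's; it mirrors the nfs_can_coalesce_requests() rework later in this patch):

#include <assert.h>
#include <stddef.h>

/* Simplified stand-ins for struct nfs_page / nfs_pageio_descriptor. */
struct page_req { size_t bytes; };
struct pg_desc { size_t count; size_t bsize; };

/* Generic test: return how many bytes of @req fit in the descriptor
 * (0 means "cannot coalesce, flush first"), never more than req->bytes. */
static size_t pg_test(struct pg_desc *desc, struct page_req *req)
{
    size_t room = desc->bsize - desc->count;
    return room < req->bytes ? room : req->bytes;
}

/* Consumer side: trim the request to the coalescable prefix, as the
 * reworked nfs_can_coalesce_requests() does with req->wb_bytes. */
static int can_coalesce(struct pg_desc *desc, struct page_req *req)
{
    size_t size = pg_test(desc, req);

    assert(size <= req->bytes);
    if (size && size < req->bytes)
        req->bytes = size;      /* caller later re-adds the remainder */
    return size > 0;
}

int main(void)
{
    struct pg_desc d = { .count = 3000, .bsize = 4096 };
    struct page_req r = { .bytes = 2000 };

    assert(can_coalesce(&d, &r));
    assert(r.bytes == 1096);    /* trimmed to the remaining budget */

    d.count = 4096;             /* descriptor full: flush before adding */
    assert(!can_coalesce(&d, &r));
    return 0;
}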
+ */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (!pnfs_generic_pg_test(pgio, prev, req)) - return false; + unsigned int size; + + size = pnfs_generic_pg_test(pgio, prev, req); + + if (!size || pgio->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; - return pgio->pg_count + req->wb_bytes <= - (unsigned long)pgio->pg_layout_private; + return min(size, req->wb_bytes); } static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index e4f9cbfec67b..765d3f54e986 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct objlayout *objlay; objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (objlay) { - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - } + if (!objlay) + return NULL; + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); dprintk("%s: Return %p\n", __func__, objlay); return &objlay->pnfs_layout; } @@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, static void _rpc_read_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } @@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = oir->rpcdata; + struct nfs_pgio_data *rdata = oir->rpcdata; oir->status = rdata->task.tk_status = status; if (status >= 0) @@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async reads. */ enum pnfs_try_status -objlayout_read_pagelist(struct nfs_read_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; @@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) static void _rpc_write_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_write_done(wdata); } @@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work) void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = oir->rpcdata; + struct nfs_pgio_data *wdata = oir->rpcdata; oir->status = wdata->task.tk_status = status; if (status >= 0) { @@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async writes. 
*/ enum pnfs_try_status -objlayout_write_pagelist(struct nfs_write_data *wdata, +objlayout_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 87aa1dec6120..01e041029a6c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg); */ extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_read_data *rdata); -extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); /* * callback API @@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( extern void objlayout_free_lseg(struct pnfs_layout_segment *); extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_read_data *); + struct nfs_pgio_data *); extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_write_data *, + struct nfs_pgio_data *, int how); extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 03ed984ab4d8..b6ee3a6ee96d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -24,9 +24,14 @@ #include "internal.h" #include "pnfs.h" +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; + +static void nfs_free_request(struct nfs_page *); -bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -133,11 +138,156 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_atomic(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! 
reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + WARN_ON_ONCE(prev == req); + + if (!prev) { + req->wb_head = req; + req->wb_this_page = req; + } else { + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* grab extra ref if head request has extra ref from + * the write/commit path to handle handoff between write + * and commit lists */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) + kref_get(&req->wb_kref); + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. + */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use - * @inode: inode to which the request is attached * @page: page to write + * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -146,9 +296,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c) * User should ensure it is safe to sleep in this function. */ struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, - unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, + unsigned int count) { struct nfs_page *req; struct nfs_lock_context *l_ctx; @@ -180,6 +330,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + nfs_page_group_init(req, last); return req; } @@ -237,16 +388,22 @@ static void nfs_clear_request(struct nfs_page *req) } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! 
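nfs_page_group_sync_on_bit() above implements a rendezvous across all subrequests of a page: every member sets the bit, and only the final caller observes true, at which point the bit is cleared on the whole ring so it can be reused. A self-contained, single-threaded model of that ring walk (the head lock is omitted and the names are illustrative, not the kernel's):

#include <assert.h>
#include <stdbool.h>

/* A member of a circular, singly linked page group, as built by
 * nfs_page_group_init(): head -> ... -> member -> ... -> back to head. */
struct member {
    struct member *this_page;   /* next element in the group ring */
    unsigned long flags;
};

/* Set @bit on @m; return true only if every member now has it set,
 * in which case all copies of the bit are cleared again. */
static bool group_sync_on_bit(struct member *m, unsigned int bit)
{
    struct member *tmp;

    m->flags |= 1UL << bit;
    for (tmp = m->this_page; tmp != m; tmp = tmp->this_page)
        if (!(tmp->flags & (1UL << bit)))
            return false;

    /* last one in: reset the bit on the whole ring */
    tmp = m;
    do {
        tmp->flags &= ~(1UL << bit);
        tmp = tmp->this_page;
    } while (tmp != m);
    return true;
}

int main(void)
{
    struct member a = { 0 }, b = { 0 }, c = { 0 };

    a.this_page = &b; b.this_page = &c; c.this_page = &a;

    assert(!group_sync_on_bit(&a, 0));  /* 1 of 3 */
    assert(!group_sync_on_bit(&b, 0));  /* 2 of 3 */
    assert(group_sync_on_bit(&c, 0));   /* last member sees true */
    assert(a.flags == 0 && b.flags == 0 && c.flags == 0);
    return 0;
}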
*/ -static void nfs_free_request(struct kref *kref) +static void nfs_free_request(struct nfs_page *req) { - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); @@ -255,13 +412,7 @@ static void nfs_free_request(struct kref *kref) void nfs_release_request(struct nfs_page *req) { - kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; + kref_put(&req->wb_kref, nfs_page_group_destroy); } /** @@ -279,22 +430,249 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req cannot be coalesced into @desc, otherwise it returns + * the number of bytes that can be coalesced (maximum @req->wb_bytes). + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) { - /* - * FIXME: ideally we should be able to coalesce all requests - * that are not block boundary aligned, but currently this - * is problematic for the case of bsize < PAGE_CACHE_SIZE, - * since nfs_flush_multi and nfs_pagein_multi assume you - * can have only one struct nfs_page.
- */ - if (desc->pg_bsize < PAGE_SIZE) + if (desc->pg_count > desc->pg_bsize) { + /* should never happen */ + WARN_ON_ONCE(1); return 0; + } - return desc->pg_count + req->wb_bytes <= desc->pg_bsize; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ + return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_rw_header *header = ops->rw_alloc_header(); + + if (header) { + struct nfs_pgio_header *hdr = &header->header; + + INIT_LIST_HEAD(&hdr->pages); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + hdr->rw_ops = ops; + } + return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_pgio_data *data, *prealloc; + + prealloc = &NFS_RW_HEADER(hdr)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; + } +out: + return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * @data: The data to release + */ +void nfs_pgio_data_release(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + + put_nfs_open_context(data->args.context); + if (data->pages.pagevec != data->pages.page_array) + kfree(data->pages.pagevec); + if (data == &pageio_header->rpc_data) { + data->header = NULL; + data = NULL; + } + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, + unsigned int count, unsigned int offset, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_page *req = data->header->req; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
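nfs_pgio_data_alloc()/nfs_pgio_data_release() above use a preallocation pattern: one nfs_pgio_data is embedded in the nfs_rw_header, so the common one-RPC case needs no second allocation, and release must distinguish the embedded slot (where it clears data->header and lets kfree(NULL) be a no-op) from heap copies. A sketch of the same pattern with illustrative names:

#include <stdio.h>
#include <stdlib.h>

struct pgio_data { int in_use; /* ... payload ... */ };

struct rw_header {
    struct pgio_data rpc_data;  /* preallocated first data item */
};

/* Hand out the embedded slot first; fall back to the heap. */
static struct pgio_data *data_alloc(struct rw_header *hdr)
{
    if (!hdr->rpc_data.in_use) {
        hdr->rpc_data.in_use = 1;
        return &hdr->rpc_data;
    }
    return calloc(1, sizeof(struct pgio_data));
}

/* free() only what came from the heap; just mark the embedded slot
 * free (an in_use flag stands in for the kernel's data->header test). */
static void data_release(struct rw_header *hdr, struct pgio_data *data)
{
    if (data == &hdr->rpc_data) {
        data->in_use = 0;
        return;
    }
    free(data);
}

int main(void)
{
    struct rw_header hdr = { 0 };
    struct pgio_data *a = data_alloc(&hdr);   /* embedded */
    struct pgio_data *b = data_alloc(&hdr);   /* heap */

    printf("a embedded: %d\n", a == &hdr.rpc_data);
    printf("b embedded: %d\n", b == &hdr.rpc_data);
    data_release(&hdr, b);
    data_release(&hdr, a);
    return 0;
}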
*/ + + data->args.fh = NFS_FH(data->header->inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pages.pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_reqs_to_commit(cinfo)) + break; + default: + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + int err; + err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->header->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; + int ret = 0; + + data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + data->task.tk_pid, + data->header->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(data->header->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + nfs_pgio_data_release(hdr->data); + hdr->data = NULL; + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_data *data = calldata; + if (data->header->rw_ops->rw_release) + data->header->rw_ops->rw_release(data); + nfs_pgio_data_release(data); +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -307,6 +685,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { @@ -320,6 +699,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; 
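The switch in nfs_pgio_rpcsetup() above deliberately falls through: FLUSH_COND_STABLE keeps the write unstable only while there are requests waiting to be committed, and any other stable variant becomes NFS_FILE_SYNC. The same decision extracted as a standalone sketch (the flag values here are illustrative, not the kernel's):

#include <assert.h>

enum stable_how { NFS_UNSTABLE, NFS_FILE_SYNC };

#define FLUSH_STABLE      4   /* illustrative flag values */
#define FLUSH_COND_STABLE 8

static enum stable_how pick_stable(int how, int commits_pending)
{
    enum stable_how stable = NFS_UNSTABLE;

    switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
    case 0:                       /* plain async write */
        break;
    case FLUSH_COND_STABLE:       /* stable only if nothing to commit */
        if (commits_pending)
            break;
        /* fall through */
    default:                      /* FLUSH_STABLE */
        stable = NFS_FILE_SYNC;
    }
    return stable;
}

int main(void)
{
    assert(pick_stable(0, 0) == NFS_UNSTABLE);
    assert(pick_stable(FLUSH_COND_STABLE, 1) == NFS_UNSTABLE);
    assert(pick_stable(FLUSH_COND_STABLE, 0) == NFS_FILE_SYNC);
    assert(pick_stable(FLUSH_STABLE, 1) == NFS_FILE_SYNC);
    return 0;
}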
desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; @@ -328,6 +708,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, } EXPORT_SYMBOL_GPL(nfs_pageio_init); +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + struct inode *inode = data->header->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (data->header->rw_ops->rw_done(task, data, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + else + data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_pgio_data *data; + struct list_head *head = &desc->pg_list; + struct nfs_commit_info cinfo; + + data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) + return nfs_pgio_error(desc, hdr); + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = data->pages.pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + *pages++ = req->wb_page; + } + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + hdr->data = data; + desc->pg_rpc_callops = &nfs_pgio_common_ops; + return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *rw_hdr; + struct nfs_pgio_header *hdr; + int ret; + + rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!rw_hdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &rw_hdr->header; + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret == 0) + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr->data, desc->pg_rpc_callops, + desc->pg_ioflags, 0); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -356,18 +824,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - if (!nfs_match_open_context(req->wb_context, prev->wb_context)) - return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && - !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; - if (req->wb_pgbase != 0) - return false; - if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return false; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; - return pgio->pg_ops->pg_test(pgio, prev, req); + size_t size; + + if (prev) { + if 
(!nfs_match_open_context(req->wb_context, prev->wb_context)) + return false; + if (req->wb_context->dentry->d_inode->i_flock != NULL && + !nfs_match_lock_context(req->wb_lock_context, + prev->wb_lock_context)) + return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + } + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size > req->wb_bytes); + if (size && size < req->wb_bytes) + req->wb_bytes = size; + return size > 0; } /** @@ -381,17 +854,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_page *prev = NULL; if (desc->pg_count != 0) { - struct nfs_page *prev; - prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); desc->pg_count += req->wb_bytes; @@ -421,22 +893,73 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * @desc: destination io descriptor * @req: request * + * This may split a request into subrequests which are all part of the + * same page group. + * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - while (!nfs_pageio_do_add_request(desc, req)) { - desc->pg_moreio = 1; - nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - desc->pg_moreio = 0; - if (desc->pg_recoalesce) - return 0; - } + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + desc->pg_moreio = 0; + if (desc->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + if (IS_ERR(subreq)) + goto err_ptr; + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); return 1; +err_ptr: + desc->pg_error = PTR_ERR(subreq); + nfs_page_group_unlock(req); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -535,3 +1058,13 @@ void nfs_destroy_nfspagecache(void) kmem_cache_destroy(nfs_page_cachep); } +static const struct rpc_call_ops nfs_pgio_common_ops = { + .rpc_call_prepare = nfs_pgio_prepare, + .rpc_call_done = nfs_pgio_result, + .rpc_release = nfs_pgio_release, +}; + +const 
struct nfs_pageio_ops nfs_pgio_rw_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fd9536e494bc..6fdcd233d6f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1388,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_read_mds(pgio); - return; - } - if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else @@ -1417,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, { WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_write_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), @@ -1434,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_read(pgio, inode, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); -} - -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, - int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} - -bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_lseg == NULL) - return nfs_generic_pg_test(pgio, prev, req); + unsigned int size; + u64 seg_end, req_start, seg_left; + + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; /* - * Test if a nfs_page is fully contained in the pnfs_layout_range. - * Note that this test makes several assumptions: - * - that the previous nfs_page in the struct nfs_pageio_descriptor - * is known to lie within the range. - * - that the nfs_page being tested is known to be contiguous with the - * previous nfs_page. - * - Layout ranges are page aligned, so we only have to test the - * start offset of the request. + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. * * Please also note that 'end_offset' is actually the offset of the * first byte that lies outside the pnfs_layout_range. FIXME? 
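The reworked pnfs_generic_pg_test(), continuing below, clamps twice: first to the generic byte budget, then to the number of bytes left in the layout segment, where end_offset() names the first byte outside the range. The arithmetic in isolation, as a sketch with u64 offsets (end_offset() is reimplemented here purely for illustration):

#include <assert.h>
#include <stdint.h>

/* First byte past the segment: [offset, offset + length) */
static uint64_t end_offset(uint64_t offset, uint64_t length)
{
    return offset + length;
}

/* @generic: bytes the generic test allows (0 = cannot coalesce)
 * @seg_off/@seg_len: current layout segment
 * @req_start: file offset of the request */
static unsigned int pnfs_clamp(unsigned int generic, uint64_t seg_off,
                               uint64_t seg_len, uint64_t req_start)
{
    uint64_t seg_end, seg_left;

    if (!generic)
        return 0;
    seg_end = end_offset(seg_off, seg_len);
    if (req_start >= seg_end)       /* request starts past the segment */
        return 0;
    seg_left = seg_end - req_start;
    if (seg_left < generic)         /* segment is the tighter bound */
        generic = (unsigned int)seg_left;
    return generic;
}

int main(void)
{
    /* segment covers [0, 8192); request at 6000 may use 2192 bytes */
    assert(pnfs_clamp(4096, 0, 8192, 6000) == 2192);
    /* request starts at the segment end: nothing coalescable */
    assert(pnfs_clamp(4096, 0, 8192, 8192) == 0);
    /* generic budget is the tighter bound */
    assert(pnfs_clamp(1000, 0, 8192, 6000) == 1000);
    return 0;
}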
* */ - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length); + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start > seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) + return 0; + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); @@ -1496,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1519,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1538,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1554,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1563,11 +1546,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); desc->pg_recoalesce = 1; } - nfs_writedata_release(data); + nfs_pgio_data_release(data); } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) @@ -1589,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, } static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) { - struct nfs_write_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); - } + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_writehdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header 
*whdr; + struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); pnfs_put_lseg(desc->pg_lseg); @@ -1634,12 +1612,12 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); + pnfs_do_write(desc, hdr, desc->pg_ioflags); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1655,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, compl_ops); + nfs_pageio_init_read(&pgio, inode, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1674,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1693,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1709,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1718,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); desc->pg_recoalesce = 1; } - nfs_readdata_release(data); + nfs_pgio_data_release(data); } /* * Call the appropriate parallel I/O subsystem read function. 
*/ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { @@ -1747,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, } static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_read_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); - } + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_readhdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; @@ -1793,12 +1765,12 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_reads(desc, &hdr->rpc_list); + pnfs_do_read(desc, hdr); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1848,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index c3058a076596..4fb309a2b4c4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type { * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ - enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); - enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, - 
int, const struct nfs_pgio_completion_ops *); - void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_write_data *); -void pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -461,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_read(pgio, inode, compl_ops); -} - -static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e55ce9e8b034..c171ce1a8a30 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -578,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -594,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_write_done(struct rpc_task 
*task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -614,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { BUG(); @@ -734,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, .read_setup = nfs_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 411aedda14bb..e818a475ca64 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,85 +24,24 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pageio_ops nfs_pageio_read_ops; -static const struct rpc_call_ops nfs_read_common_ops; static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_header *nfs_readhdr_alloc(void) +static struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_read_header *rhdr; - - rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); - if (rhdr) { - struct nfs_pgio_header *hdr = &rhdr->header; - - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - } - return rhdr; + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) +static void nfs_readhdr_free(struct nfs_rw_header *rhdr) { - struct nfs_read_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -void nfs_readhdr_free(struct nfs_pgio_header *hdr) -{ - struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); - kmem_cache_free(nfs_rdata_cachep, rhdr); } -EXPORT_SYMBOL_GPL(nfs_readhdr_free); - -void nfs_readdata_release(struct nfs_read_data *rdata) -{ - struct nfs_pgio_header *hdr = rdata->header; - struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); - - put_nfs_open_context(rdata->args.context); - if (rdata->pages.pagevec != 
rdata->pages.page_array) - kfree(rdata->pages.pagevec); - if (rdata == &read_header->rpc_data) { - rdata->header = NULL; - rdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(rdata); -} -EXPORT_SYMBOL_GPL(nfs_readdata_release); static int nfs_return_empty_page(struct page *page) @@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page) } void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, - NFS_SERVER(inode)->rsize, 0); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + new = nfs_create_request(ctx, page, NULL, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -158,10 +105,16 @@ static void nfs_readpage_release(struct nfs_page *req) { struct inode *d_inode = req->wb_context->dentry->d_inode; - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + (long long)req_offset(req)); - unlock_page(req->wb_page); + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -171,7 +124,12 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -/* Note io was page aligned */ +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); +} + static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; + unsigned long start = req->wb_pgbase; + unsigned long end = req->wb_pgbase + req->wb_bytes; if 
(test_bit(NFS_IOHDR_EOF, &hdr->flags)) { - if (bytes > hdr->good_bytes) - zero_user(page, 0, PAGE_SIZE); - else if (hdr->good_bytes - bytes < PAGE_SIZE) - zero_user_segment(page, - hdr->good_bytes & ~PAGE_MASK, - PAGE_SIZE); + /* note: regions of the page not covered by a + * request are zeroed in nfs_readpage_async / + * readpage_async_filler */ + if (bytes > hdr->good_bytes) { + /* nothing in this request was good, so zero + * the full extent of the request */ + zero_user_segment(page, start, end); + + } else if (hdr->good_bytes - bytes < req->wb_bytes) { + /* part of this request has good bytes, but + * not all. zero the bad bytes */ + start += hdr->good_bytes - bytes; + WARN_ON(start < req->wb_pgbase); + zero_user_segment(page, start, end); + } } bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); } else - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); nfs_readpage_release(req); } @@ -203,95 +172,14 @@ out: hdr->release(hdr); } -int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = clnt, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags | flags, - }; - /* Set up the initial task struct. 
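The nfs_read_completion() zeroing above handles an EOF that lands inside a page group: a subrequest lying entirely past hdr->good_bytes is zeroed in full, while a partially good one is zeroed only from the first bad byte. The same bookkeeping against a plain buffer (a hypothetical helper, not kernel code):

#include <assert.h>
#include <string.h>

/* Zero the part of one subrequest [pgbase, pgbase + len) in @page that
 * lies beyond the @good_bytes the server actually returned. @done is
 * how many bytes earlier subrequests already accounted for. */
static void zero_tail(char *page, unsigned long pgbase, unsigned long len,
                      unsigned long done, unsigned long good_bytes)
{
    unsigned long start = pgbase, end = pgbase + len;

    if (done > good_bytes) {
        /* nothing in this subrequest was good: zero all of it */
        memset(page + start, 0, end - start);
    } else if (good_bytes - done < len) {
        /* partially good: zero only the bad tail */
        start += good_bytes - done;
        memset(page + start, 0, end - start);
    }
}

int main(void)
{
    char page[8];

    memset(page, 'x', sizeof(page));

    /* two 4-byte subrequests; the server returned only 6 good bytes */
    zero_tail(page, 0, 4, 0, 6);    /* fully good: left untouched */
    zero_tail(page, 4, 4, 4, 6);    /* bytes 6 and 7 get zeroed */

    assert(page[5] == 'x' && page[6] == 0 && page[7] == 0);
    return 0;
}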
*/ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " - "offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; -} -EXPORT_SYMBOL_GPL(nfs_initiate_read); - -/* - * Set up the NFS read request struct - */ -static void nfs_read_rpcsetup(struct nfs_read_data *data, - unsigned int count, unsigned int offset) -{ - struct nfs_page *req = data->header->req; - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_read(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); -} - -static int -nfs_do_multiple_reads(struct list_head *head, - const struct rpc_call_ops *call_ops) -{ - struct nfs_read_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_read(data, call_ops); - if (ret == 0) - ret = ret2; - } - return ret; + task_setup_data->flags |= swap_flags; + NFS_PROTO(inode)->read_setup(data, msg); } static void @@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .completion = nfs_read_completion, }; -static void nfs_pagein_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_read_data, list); - list_del(&data->list); - nfs_readdata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire. If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated. This is more - * or less conventional NFS client behavior. 
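This deleted comment described nfs_pagein_multi(), which pre-chopped a page into rsize-sized requests. The page-group machinery added earlier in this patch replaces it: __nfs_pageio_add_request() now carves trimmed subrequests off the original request on demand, each sized by pg_test. A sketch of that carving loop over a plain byte range (no locking or I/O; a fixed budget stands in for pg_test):

#include <assert.h>
#include <stdio.h>

/* Carve @bytes starting at @offset into subrequests of at most @budget
 * bytes each, the way __nfs_pageio_add_request() creates trimmed
 * subrequests within one page group. Returns the subrequest count. */
static int add_request(unsigned int offset, unsigned int bytes,
                       unsigned int budget)
{
    int subreqs = 0;

    while (bytes > 0) {
        unsigned int len = bytes < budget ? bytes : budget;

        printf("subreq %d: offset %u, %u bytes\n", subreqs, offset, len);
        subreqs++;
        offset += len;
        bytes -= len;
    }
    return subreqs;
}

int main(void)
{
    /* a 4096-byte page against a 1500-byte budget -> 3 subrequests */
    assert(add_request(0, 4096, 1500) == 3);
    return 0;
}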
- */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_read_data *data; - size_t rsize = desc->pg_bsize, nbytes; - unsigned int offset; - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes,rsize); - - data = nfs_readdata_alloc(hdr, 1); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_read_rpcsetup(data, len, offset); - list_add(&data->list, &hdr->rpc_list); - nbytes -= len; - offset += len; - } while (nbytes != 0); - - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_read_data *data; - struct list_head *head = &desc->pg_list; - - data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - nfs_read_rpcsetup(data, desc->pg_count, 0); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_pagein_multi(desc, hdr); - return nfs_pagein_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_pagein); - -static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_read_header *rhdr; - struct nfs_pgio_header *hdr; - int ret; - - rhdr = nfs_readhdr_alloc(); - if (!rhdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &rhdr->header; - nfs_pgheader_init(desc, hdr, nfs_readhdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_reads(&hdr->rpc_list, - desc->pg_rpc_callops); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_read_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_readpages, -}; - /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct inode *inode = data->header->inode; - int status; - - dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, - task->tk_status); - - status = NFS_PROTO(inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; @@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_readargs *argp = &data->args; - struct nfs_readres *resp = &data->res; + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; /* This is a short read! */ nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); @@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data rpc_restart_call_prepare(task); } -static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_read_data *data = calldata; struct nfs_pgio_header *hdr = data->header; - /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ - if (nfs_readpage_result(task, data) != 0) - return; - if (task->tk_status < 0) - nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); - else if (data->res.eof) { + if (data->res.eof) { loff_t bound; bound = data->args.offset + data->res.count; @@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) nfs_readpage_retry(task, data); } -static void nfs_readpage_release_common(void *calldata) -{ - nfs_readdata_release(calldata); -} - -void nfs_read_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - -static const struct rpc_call_ops nfs_read_common_ops = { - .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_readpage_result_common, - .rpc_release = nfs_readpage_release_common, -}; - /* * Read a page over NFS. 
* We read the page synchronously in the following case: @@ -592,7 +325,6 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; @@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, page, NULL, 0, len); if (IS_ERR(new)) goto out_error; @@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); @@ -671,7 +404,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) @@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void) { kmem_cache_destroy(nfs_rdata_cachep); } + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, + .rw_initiate = nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2cb56943e232..084af1060d79 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2180,11 +2180,23 @@ out_no_address: return -EINVAL; } +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_UNSHARED \ + | NFS_MOUNT_NORESVPORT \ + | NFS_MOUNT_LEGACY_INTERFACE) + static int nfs_compare_remount_data(struct nfs_server *nfss, struct nfs_parsed_mount_data *data) { - if (data->flags != nfss->flags || + if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || data->rsize != nfss->rsize || data->wsize != nfss->wsize || data->version != nfss->nfs_client->rpc_ops->version || @@ -2248,6 +2260,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; data->version = nfsvers; data->minorversion = nfss->nfs_client->cl_minorversion; + data->net = current->nsproxy->net_ns; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); @@ -2347,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) nfs_initialise_sb(sb); } -#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ - | NFS_MOUNT_SECURE \ - | NFS_MOUNT_TCP \ - | NFS_MOUNT_VER3 \ - | NFS_MOUNT_KERBEROS \ - | NFS_MOUNT_NONLM \ - | NFS_MOUNT_BROKEN_SUID \ - | NFS_MOUNT_STRICTLOCK \ - | NFS_MOUNT_UNSHARED \ - | NFS_MOUNT_NORESVPORT \ - | NFS_MOUNT_LEGACY_INTERFACE) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ffb9459f180b..3ee5af4e738e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -42,10 +42,10 @@ * Local function declarations */ static void nfs_redirty_request(struct nfs_page *req); -static 
const struct rpc_call_ops nfs_write_common_ops; static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -70,76 +70,19 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_header *nfs_writehdr_alloc(void) +static struct nfs_rw_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - - if (p) { - struct nfs_pgio_header *hdr = &p->header; + struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + if (p) memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - hdr->verf = &p->verf; - } return p; } -EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); - -static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) -{ - struct nfs_write_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} -void nfs_writehdr_free(struct nfs_pgio_header *hdr) +static void nfs_writehdr_free(struct nfs_rw_header *whdr) { - struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); mempool_free(whdr, nfs_wdata_mempool); } -EXPORT_SYMBOL_GPL(nfs_writehdr_free); - -void nfs_writedata_release(struct nfs_write_data *wdata) -{ - struct nfs_pgio_header *hdr = wdata->header; - struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); - - put_nfs_open_context(wdata->args.context); - if (wdata->pages.pagevec != wdata->pages.page_array) - kfree(wdata->pages.pagevec); - if (wdata == &write_header->rpc_data) { - wdata->header = NULL; - wdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(wdata); -} -EXPORT_SYMBOL_GPL(nfs_writedata_release); static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) { @@ -211,18 +154,78 @@ static void nfs_set_pageerror(struct page *page) nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. 
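The page-group machinery introduced here leans on a handful of struct nfs_page fields; the subset below is reconstructed from their usage in this series (other members omitted, layout approximate):

	struct nfs_page {
		struct page		*wb_page;	/* page this request covers */
		struct nfs_page		*wb_this_page;	/* next request in group; circular list */
		struct nfs_page		*wb_head;	/* head request of the group */
		unsigned int		wb_pgbase;	/* offset of data within the page */
		unsigned int		wb_bytes;	/* bytes covered by this request */
		unsigned long		wb_flags;	/* PG_HEADLOCK, PG_WB_END, PG_REMOVE, ... */
		/* ... */
	};
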
+ * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ + struct nfs_page *req; + + WARN_ON_ONCE(head != head->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + + req = head; + do { + if (page_offset >= req->wb_pgbase && + page_offset < (req->wb_pgbase + req->wb_bytes)) + return req; + + req = req->wb_this_page; + } while (req != head); + + return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ + struct nfs_page *tmp; + unsigned int pos = 0; + unsigned int len = nfs_page_length(req->wb_page); + + nfs_page_group_lock(req); + + do { + tmp = nfs_page_group_search_locked(req->wb_head, pos); + if (tmp) { + /* no way this should happen */ + WARN_ON_ONCE(tmp->wb_pgbase != pos); + pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); + } + } while (tmp && pos < len); + + nfs_page_group_unlock(req); + WARN_ON_ONCE(pos > len); + return pos == len; +} + /* We can set the PG_uptodate flag if we see that a write request * covers the full page. */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(page)) - return; - if (base != 0) + if (PageUptodate(req->wb_page)) return; - if (count != nfs_page_length(page)) + if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(page); + SetPageUptodate(req->wb_page); } static int wb_priority(struct writeback_control *wbc) @@ -258,12 +261,15 @@ static void nfs_set_page_writeback(struct page *page) } } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); - end_page_writeback(page); + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } @@ -354,10 +360,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, - page->mapping->host, - wb_priority(wbc), - &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -400,7 +404,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); @@ -425,6 +430,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); + WARN_ON_ONCE(req->wb_this_page != req); + /* Lock the request! 
*/ nfs_lock_request(req); @@ -441,6 +448,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; + set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -452,15 +460,20 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; - spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(req->wb_page))) { - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; + + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->npages--; + spin_unlock(&inode->i_lock); } - nfsi->npages--; - spin_unlock(&inode->i_lock); nfs_release_request(req); } @@ -583,7 +596,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { if (data->verf.committed == NFS_DATA_SYNC) return data->header->lseg == NULL; @@ -614,7 +627,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { return 0; } @@ -625,6 +638,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; unsigned long bytes = 0; + bool do_destroy; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; @@ -645,7 +659,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { - memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -653,7 +667,8 @@ remove_req: nfs_inode_remove_request(req); next: nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); nfs_release_request(req); } out: @@ -661,7 +676,7 @@ out: } #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) -static unsigned long +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return cinfo->mds->ncommit; @@ -718,7 +733,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, } #else -static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return 0; } @@ -758,6 +773,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, if (req == NULL) goto out_unlock; + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + rqend = req->wb_offset + req->wb_bytes; /* * Tell the caller to flush out the request if @@ -819,7 +838,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, inode, page, offset, bytes); + req = nfs_create_request(ctx, page, NULL, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); @@ -837,7 +856,7 
@@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, return PTR_ERR(req); /* Update file length */ nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; @@ -863,6 +882,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; @@ -990,126 +1011,17 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = clnt, - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, - .priority = priority, - }; - int ret = 0; - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - dprintk("NFS: %5u initiated write call " - "(req %s/%llu, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + task_setup_data->priority = priority; + NFS_PROTO(inode)->write_setup(data, msg); nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, - &task_setup_data.rpc_client, &msg, data); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; -} -EXPORT_SYMBOL_GPL(nfs_initiate_write); - -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_write_rpcsetup(struct nfs_write_data *data, - unsigned int count, unsigned int offset, - int how, struct nfs_commit_info *cinfo) -{ - struct nfs_page *req = data->header->req; - - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. 
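The shape of this refactor: rpc_message and rpc_task_setup construction, rpc_run_task() and the FLUSH_SYNC wait all move into shared pageio code, leaving read and write with small initiate hooks (the rw_initiate members of the nfs_rw_ops tables at the end of each file). A reconstructed outline of the shared caller follows; since the common helper itself is outside this diff, clnt, call_ops, rw_ops and how here stand for whatever the pageio layer passes in:

	/* Sketch only: the generic layer builds the task; the per-type
	 * rw_initiate hook (nfs_initiate_read/nfs_initiate_write) fills in
	 * the protocol setup and priority before the task is run. */
	struct rpc_message msg = {
		.rpc_argp	= &data->args,
		.rpc_resp	= &data->res,
		.rpc_cred	= data->header->cred,
	};
	struct rpc_task_setup task_setup_data = {
		.rpc_client	= clnt,
		.task		= &data->task,
		.rpc_message	= &msg,
		.callback_ops	= call_ops,
		.callback_data	= data,
		.workqueue	= nfsiod_workqueue,
		.flags		= RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
	task = rpc_run_task(&task_setup_data);
	if (IS_ERR(task))
		return PTR_ERR(task);
	rpc_put_task(task);
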
*/ - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - /* pnfs_set_layoutcommit needs this */ - data->mds_offset = data->args.offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - data->args.stable = NFS_UNSTABLE; - switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - case 0: - break; - case FLUSH_COND_STABLE: - if (nfs_reqs_to_commit(cinfo)) - break; - default: - data->args.stable = NFS_FILE_SYNC; - } - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_write(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); -} - -static int nfs_do_multiple_writes(struct list_head *head, - const struct rpc_call_ops *call_ops, - int how) -{ - struct nfs_write_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_write(data, call_ops, how); - if (ret == 0) - ret = ret2; - } - return ret; + &task_setup_data->rpc_client, msg, data); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1120,7 +1032,7 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1140,173 +1052,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .completion = nfs_write_completion, }; -static void nfs_flush_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_write_data, list); - list_del(&data->list); - nfs_writedata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. - */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_write_data *data; - size_t wsize = desc->pg_bsize, nbytes; - unsigned int offset; - int requests = 0; - struct nfs_commit_info cinfo; - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) || - desc->pg_count > wsize)) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes, wsize); - - data = nfs_writedata_alloc(hdr, 1); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - requests++; - nbytes -= len; - offset += len; - } while (nbytes != 0); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -/* - * Create an RPC task for the given write request and kick it. 
- * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. - */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_write_data *data; - struct list_head *head = &desc->pg_list; - struct nfs_commit_info cinfo; - - data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - /* Set up the argument struct */ - nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_flush_multi(desc, hdr); - return nfs_flush_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_flush); - -static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_write_header *whdr; - struct nfs_pgio_header *hdr; - int ret; - - whdr = nfs_writehdr_alloc(); - if (!whdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &whdr->header; - nfs_pgheader_init(desc, hdr, nfs_writehdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_writes(&hdr->rpc_list, - desc->pg_rpc_callops, - desc->pg_ioflags); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_write_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_writepages, -}; - void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, - NFS_SERVER(inode)->wsize, ioflags); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - void nfs_commit_prepare(struct rpc_task *task, void *calldata) { struct nfs_commit_data *data = calldata; @@ -1314,23 
+1083,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. - */ -static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - - nfs_writeback_done(task, data); -} - -static void nfs_writeback_release_common(void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data) { - struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; @@ -1339,34 +1093,46 @@ static void nfs_writeback_release_common(void *calldata) if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); - else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) + memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); + else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } - nfs_writedata_release(data); } -static const struct rpc_call_ops nfs_write_common_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_writeback_done_common, - .rpc_release = nfs_writeback_release_common, -}; +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; - struct inode *inode = data->header->inode; int status; - dprintk("NFS: %5u nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); - /* * ->write_done will attempt to use post-op attributes to detect * conflicting writes by other clients. A strict interpretation @@ -1376,11 +1142,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) - return; - nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. 
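For reference, the three stability levels compared by the committed < stable test above are ordered by strength, with values fixed by the NFSv3 protocol (RFC 1813); a conforming server may answer with a stronger level than requested, never a weaker one, which is exactly the violation that check catches:

	enum nfs3_stable_how {
		NFS_UNSTABLE	= 0,	/* data may still sit in server memory */
		NFS_DATA_SYNC	= 1,	/* data on stable storage; metadata maybe not */
		NFS_FILE_SYNC	= 2,	/* data and metadata both on stable storage */
	};
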
@@ -1396,18 +1162,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - resp->verf->committed, argp->stable); + data->res.verf->committed, data->args.stable); complain = jiffies + 300 * HZ; } } #endif - if (task->tk_status < 0) - nfs_set_pgio_error(data->header, task->tk_status, argp->offset); - else if (resp->count < argp->count) { + + /* Deal with the suid/sgid bit corner case */ + if (nfs_should_remove_suid(inode)) + nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; + + if (resp->count < argp->count) { static unsigned long complain; /* This a short write! */ - nfs_inc_stats(inode, NFSIOS_SHORTWRITE); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ if (resp->count == 0) { @@ -1874,7 +1653,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) @@ -1936,3 +1715,12 @@ void nfs_destroy_writepagecache(void) kmem_cache_destroy(nfs_wdata_cachep); } +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, +}; diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index b481e1f5eecc..a986ceb6fd0d 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -49,7 +49,7 @@ struct svc_rqst; struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -__be32 nfs4_acl_write_who(int who, __be32 **p, int *len); +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 2645be435e75..72f44823adbb 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,7 +1,6 @@ /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> -#include <linux/user_namespace.h> #include "nfsd.h" #include "auth.h" @@ -25,7 +24,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) struct cred *new; int i; int flags = nfsexp_flags(rqstp, exp); - int ret; validate_process_creds(); @@ -86,8 +84,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) return 0; oom: - ret = -ENOMEM; abort_creds(new); - return ret; + return -ENOMEM; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8513c598fabf..13b85f94d9e2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -17,17 +17,12 @@ #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> -#include <net/ipv6.h> - #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT -typedef struct auth_domain svc_client; -typedef struct svc_export svc_export; - /* * We have two caches. 
* One maps client+vfsmnt+dentry to export options - the export map @@ -73,7 +68,7 @@ static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { - /* client fsidtype fsid [path] */ + /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; @@ -295,13 +290,19 @@ svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { + struct nfsd4_fs_location *locations = fsloc->locations; int i; + if (!locations) + return; + for (i = 0; i < fsloc->locations_count; i++) { - kfree(fsloc->locations[i].path); - kfree(fsloc->locations[i].hosts); + kfree(locations[i].path); + kfree(locations[i].hosts); } - kfree(fsloc->locations); + + kfree(locations); + fsloc->locations = NULL; } static void svc_export_put(struct kref *ref) @@ -388,6 +389,10 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int len; int migrated, i, err; + /* more than one fsloc */ + if (fsloc->locations) + return -EINVAL; + /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) @@ -437,13 +442,18 @@ out_free_all: static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { - int listsize, err; struct exp_flavor_info *f; + u32 listsize; + int err; + + /* more than one secinfo */ + if (exp->ex_nflavors) + return -EINVAL; - err = get_int(mesg, &listsize); + err = get_uint(mesg, &listsize); if (err) return err; - if (listsize < 0 || listsize > MAX_SECINFO_LIST) + if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { @@ -474,6 +484,27 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static inline int +uuid_parse(char **mesg, char *buf, unsigned char **puuid) +{ + int len; + + /* more than one uuid */ + if (*puuid) + return -EINVAL; + + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != EX_UUID_LEN) + return -EINVAL; + + *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); + if (*puuid == NULL) + return -ENOMEM; + + return 0; +} + static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ @@ -552,18 +583,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); - else if (strcmp(buf, "uuid") == 0) { - /* expect a 16 byte uuid encoded as \xXXXX... 
*/ - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len != 16) - err = -EINVAL; - else { - exp.ex_uuid = - kmemdup(buf, 16, GFP_KERNEL); - if (exp.ex_uuid == NULL) - err = -ENOMEM; - } - } else if (strcmp(buf, "secinfo") == 0) + else if (strcmp(buf, "uuid") == 0) + err = uuid_parse(&mesg, buf, &exp.ex_uuid); + else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything @@ -649,7 +671,7 @@ static int svc_export_show(struct seq_file *m, if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); - for (i=0; i<16; i++) { + for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); @@ -771,7 +793,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old) static struct svc_expkey * -exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, +exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; @@ -793,9 +815,9 @@ exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, return ek; } - -static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, - const struct path *path, struct cache_req *reqp) +static struct svc_export * +exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, + const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -819,11 +841,11 @@ static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, /* * Find the export entry for a given dentry. */ -static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, - struct path *path) +static struct svc_export * +exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); - svc_export *exp = exp_get_by_name(cd, clp, path, NULL); + struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); @@ -844,7 +866,7 @@ static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, * since its harder to fool a kernel module than a user space program. */ int -exp_rootfh(struct net *net, svc_client *clp, char *name, +exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; diff --git a/include/linux/nfsd/export.h b/fs/nfsd/export.h index 7898c997dfea..cfeea85c5bed 100644 --- a/include/linux/nfsd/export.h +++ b/fs/nfsd/export.h @@ -1,17 +1,16 @@ /* - * include/linux/nfsd/export.h - * - * Public declarations for NFS exports. The definitions for the - * syscall interface are in nfsctl.h - * * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> */ #ifndef NFSD_EXPORT_H #define NFSD_EXPORT_H -# include <linux/nfsd/nfsfh.h> +#include <linux/sunrpc/cache.h> #include <uapi/linux/nfsd/export.h> +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + /* * FS Locations */ @@ -38,6 +37,7 @@ struct nfsd4_fs_locations { * spkm3i, and spkm3p (and using all 8 at once should be rare). 
*/ #define MAX_SECINFO_LIST 8 +#define EX_UUID_LEN 16 struct exp_flavor_info { u32 pseudoflavor; @@ -54,7 +54,7 @@ struct svc_export { int ex_fsid; unsigned char * ex_uuid; /* 16 byte fsid */ struct nfsd4_fs_locations ex_fslocs; - int ex_nflavors; + uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; struct cache_detail *cd; }; diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index d620e7f81429..2ed05c3cd43d 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -97,25 +97,14 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, { static u64 val; char read_buf[25]; - size_t size, ret; + size_t size; loff_t pos = *ppos; if (!pos) nfsd_inject_get(file_inode(file)->i_private, &val); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - if (pos < 0) - return -EINVAL; - if (pos >= size || !len) - return 0; - if (len > size - pos) - len = size - pos; - ret = copy_to_user(buf, read_buf + pos, len); - if (ret == len) - return -EFAULT; - len -= ret; - *ppos = pos + len; - return len; + return simple_read_from_buffer(buf, len, ppos, read_buf, size); } static ssize_t fault_inject_write(struct file *file, const char __user *buf, diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index 66e58db01936..a3f34900091f 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *); -__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *); +__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t); +__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 11c1fba29312..12b023a7ab7d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -182,7 +182,8 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p); p++; @@ -197,7 +198,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p++); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || @@ -218,7 +220,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -226,7 +229,8 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->access = ntohl(*p++); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index adc5f1b1dc26..2a514e21dc74 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -128,7 +128,8 @@ out: static int 
nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *args) { - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p); p++; @@ -143,7 +144,8 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p++); if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index de6e39e12cb3..e6c01e80325e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -278,7 +278,8 @@ void fill_post_wcc(struct svc_fh *fhp) int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -287,7 +288,8 @@ int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_sattrargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = decode_sattr3(p, &args->attrs); @@ -315,7 +317,8 @@ int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->access = ntohl(*p++); @@ -330,7 +333,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, int v; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -360,7 +364,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -535,7 +540,8 @@ int nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -558,7 +564,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -580,7 +587,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, int len; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -605,7 +613,8 @@ int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_commitargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); args->count = ntohl(*p++); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index f66c66b9f182..d714156a19fd 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/export.h> #include "nfsfh.h" #include "nfsd.h" #include "acl.h" @@ -920,20 +919,19 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -__be32 nfs4_acl_write_who(int who, __be32 **p, int *len) +__be32 nfs4_acl_write_who(struct 
xdr_stream *xdr, int who) { + __be32 *p; int i; - int bytes; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { if (s2t_map[i].type != who) continue; - bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2); - if (bytes > *len) + p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, s2t_map[i].string, + p = xdr_encode_opaque(p, s2t_map[i].string, s2t_map[i].stringlen); - *len -= bytes; return 0; } WARN_ON_ONCE(1); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index c0dfde68742e..a0ab0a847d69 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,44 +551,43 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen) +static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) { char buf[11]; int len; - int bytes; + __be32 *p; len = sprintf(buf, "%u", id); - bytes = 4 + (XDR_QUADLEN(len) << 2); - if (bytes > *buflen) + p = xdr_reserve_space(xdr, len + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, buf, len); - *buflen -= bytes; + p = xdr_encode_opaque(p, buf, len); return 0; } -static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) +static __be32 idmap_id_to_name(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; + __be32 *p; int ret; - int bytes; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return encode_ascii_id(id, p, buflen); + return encode_ascii_id(xdr, id); if (ret) return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); - bytes = 4 + (XDR_QUADLEN(ret) << 2); - if (bytes > *buflen) + p = xdr_reserve_space(xdr, ret + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, item->name, ret); - *buflen -= bytes; + p = xdr_encode_opaque(p, item->name, ret); cache_put(&item->h, nn->idtoname_cache); return 0; } @@ -622,11 +621,12 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) +static __be32 encode_name_from_id(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return encode_ascii_id(id, p, buflen); - return idmap_id_to_name(rqstp, type, id, p, buflen); + return encode_ascii_id(xdr, id); + return idmap_id_to_name(xdr, rqstp, type, id); } __be32 @@ -655,14 +655,16 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen) +__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kuid_t uid) { u32 id = from_kuid(&init_user_ns, uid); - return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } -__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen) +__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kgid_t gid) { u32 id = from_kgid(&init_user_ns, gid); - return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, 
buflen); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d543222babf3..6851b003f2a4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -430,12 +430,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion, nn); if (status) goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); @@ -445,7 +445,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -786,7 +785,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfsd4_last_compound_op(rqstp)) rqstp->rq_splice_ok = false; - nfs4_lock_state(); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &read->rd_stateid, @@ -794,11 +792,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } - if (read->rd_filp) - get_file(read->rd_filp); status = nfs_ok; out: - nfs4_unlock_state(); read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -937,10 +932,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &setattr->sa_stateid, WR_STATE, NULL); - nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -1006,17 +999,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, stateid, WR_STATE, &filp); if (status) { - nfs4_unlock_state(); dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; } - if (filp) - get_file(filp); - nfs4_unlock_state(); cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; @@ -1072,10 +1060,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_jukebox; p = buf; - status = nfsd4_encode_fattr(&cstate->current_fh, + status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh, cstate->current_fh.fh_export, - cstate->current_fh.fh_dentry, &p, - count, verify->ve_bmval, + cstate->current_fh.fh_dentry, + verify->ve_bmval, rqstp, 0); /* * If nfsd4_encode_fattr() ran out of space, assume that's because @@ -1182,9 +1170,7 @@ struct nfsd4_operation { static struct nfsd4_operation nfsd4_ops[]; -#ifdef NFSD_DEBUG static const char *nfsd4_op_name(unsigned opnum); -#endif /* * Enforce NFSv4.1 COMPOUND ordering rules: @@ -1226,6 +1212,8 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { + if (op->opnum == OP_ILLEGAL) + return false; return OPDESC(op)->op_flags & OP_CACHEME; } @@ -1262,6 +1250,25 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } +static void 
svcxdr_init_encode(struct svc_rqst *rqstp, + struct nfsd4_compoundres *resp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + + xdr->buf = buf; + xdr->iov = head; + xdr->p = head->iov_base + head->iov_len; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; + /* Tail and page_len should be zero at this point: */ + buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages - 1; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - rqstp->rq_auth_slack; +} + /* * COMPOUND call. */ @@ -1275,24 +1282,16 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate = &resp->cstate; struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; - int slack_bytes; - u32 plen = 0; __be32 status; - resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + - rqstp->rq_res.head[0].iov_len; - resp->tagp = resp->p; + svcxdr_init_encode(rqstp, resp); + resp->tagp = resp->xdr.p; /* reserve space for: taglen, tag, and opcnt */ - resp->p += 2 + XDR_QUADLEN(args->taglen); - resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE; + xdr_reserve_space(&resp->xdr, 8 + args->taglen); resp->taglen = args->taglen; resp->tag = args->tag; - resp->opcnt = 0; resp->rqstp = rqstp; cstate->minorversion = args->minorversion; - cstate->replay_owner = NULL; - cstate->session = NULL; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); /* @@ -1332,19 +1331,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } - /* We must be able to encode a successful response to - * this operation, with enough room left over to encode a - * failed response to the next operation. If we don't - * have enough room, fail with ERR_RESOURCE. 
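With svcxdr_init_encode() above setting up an xdr_stream over rq_res, the space check this removed comment describes moves into the individual encoders: reserve, bail out with nfserr_resource if the stream is full, then encode. The recurring pattern (xdr, data and len stand for whichever encoder's stream and payload):

	__be32 *p;

	p = xdr_reserve_space(xdr, 4 + len);	/* length word + opaque body */
	if (!p)
		return nfserr_resource;		/* fails per-op, not per-compound */
	p = xdr_encode_opaque(p, data, len);
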
- */ - slack_bytes = (char *)resp->end - (char *)resp->p; - if (slack_bytes < COMPOUND_SLACK_SPACE - + COMPOUND_ERR_SLACK_SPACE) { - BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); - op->status = nfserr_resource; - goto encode_op; - } - opdesc = OPDESC(op); if (!current_fh->fh_dentry) { @@ -1362,9 +1348,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, /* If op is non-idempotent */ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { - plen = opdesc->op_rsize_bop(rqstp, op); /* - * If there's still another operation, make sure + * Don't execute this op if we couldn't encode a + * successful reply: + */ + u32 plen = opdesc->op_rsize_bop(rqstp, op); + /* + * Plus if there's another operation, make sure * we'll have space to at least encode an error: */ if (resp->opcnt < args->opcnt) @@ -1399,7 +1389,7 @@ encode_op: } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(resp, op); + nfsd4_encode_replay(&resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); @@ -1438,7 +1428,8 @@ out: #define op_encode_change_info_maxsz (5) #define nfs4_fattr_bitmap_maxsz (4) -#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* We'll fall back on returning no lockowner if we run out of space: */ +#define op_encode_lockowner_maxsz (0) #define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -1470,6 +1461,49 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +/* + * Note since this is an idempotent operation we won't insist on failing + * the op prematurely if the estimate is too large. We may turn off splice + * reads unnecessarily.
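The arithmetic in these reply-size estimators reads more easily with XDR_QUADLEN expanded; it is the standard sunrpc macro that rounds a byte count up to 4-byte XDR words:

	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes to XDR words, rounded up */

	/* Worked example for the READ estimate further below, with rlen = 1000:
	 *	(op_encode_hdr_size + 2 + XDR_QUADLEN(1000)) * sizeof(__be32)
	 *    = (op_encode_hdr_size + 2 + 250) * 4
	 * i.e. the data now counts as rounded words inside the estimate,
	 * where the old formula tacked the raw rlen on after the multiply. */
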
+ */ +static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 *bmap = op->u.getattr.ga_bmval; + u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; + u32 ret = 0; + + if (bmap0 & FATTR4_WORD0_ACL) + return svc_max_payload(rqstp); + if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) + return svc_max_payload(rqstp); + + if (bmap1 & FATTR4_WORD1_OWNER) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER; + } + if (bmap1 & FATTR4_WORD1_OWNER_GROUP) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER_GROUP; + } + if (bmap0 & FATTR4_WORD0_FILEHANDLE) { + ret += NFS4_FHSIZE + 4; + bmap0 &= ~FATTR4_WORD0_FILEHANDLE; + } + if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { + ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; + } + /* + * Largest of remaining attributes are 16 bytes (e.g., + * supported_attributes) + */ + ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2)); + /* bitmask, length */ + ret += 20; + return ret; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1500,18 +1534,19 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) if (rlen > maxcount) rlen = maxcount; - return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; + return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { + u32 maxcount = svc_max_payload(rqstp); u32 rlen = op->u.readdir.rd_maxcount; - if (rlen > PAGE_SIZE) - rlen = PAGE_SIZE; + if (rlen > maxcount) + rlen = maxcount; - return (op_encode_hdr_size + op_encode_verifier_maxsz) - * sizeof(__be32) + rlen; + return (op_encode_hdr_size + op_encode_verifier_maxsz + + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1526,6 +1561,12 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + op_encode_change_info_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return NFS4_MAX_SESSIONID_LEN + 20; +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -1539,7 +1580,7 @@ static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_o static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); + return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1607,6 +1648,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_rsize_bop = nfsd4_getattr_rsize, .op_name = "OP_GETATTR", }, [OP_GETFH] = { @@ -1676,37 +1718,32 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | 
OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTPUBFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTROOTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, @@ -1864,14 +1901,33 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; -#ifdef NFSD_DEBUG +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + struct nfsd4_operation *opdesc; + nfsd4op_rsize estimator; + + if (op->opnum == OP_ILLEGAL) + return op_encode_hdr_size * sizeof(__be32); + opdesc = OPDESC(op); + estimator = opdesc->op_rsize_bop; + return estimator ? estimator(rqstp, op) : PAGE_SIZE; +} + +void warn_on_nonidempotent_op(struct nfsd4_op *op) +{ + if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { + pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + op->opnum, nfsd4_op_name(op->opnum)); + WARN_ON_ONCE(1); + } +} + static const char *nfsd4_op_name(unsigned opnum) { if (opnum < ARRAY_SIZE(nfsd4_ops)) return nfsd4_ops[opnum].op_name; return "unknown_operation"; } -#endif #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9a77a5a21557..c0d45cec9958 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -81,13 +81,13 @@ static DEFINE_MUTEX(client_mutex); * effort to decrease the scope of the client_mutex, this spinlock may * eventually cover more: */ -static DEFINE_SPINLOCK(recall_lock); +static DEFINE_SPINLOCK(state_lock); -static struct kmem_cache *openowner_slab = NULL; -static struct kmem_cache *lockowner_slab = NULL; -static struct kmem_cache *file_slab = NULL; -static struct kmem_cache *stateid_slab = NULL; -static struct kmem_cache *deleg_slab = NULL; +static struct kmem_cache *openowner_slab; +static struct kmem_cache *lockowner_slab; +static struct kmem_cache *file_slab; +static struct kmem_cache *stateid_slab; +static struct kmem_cache *deleg_slab; void nfs4_lock_state(void) @@ -235,9 +235,9 @@ static void nfsd4_free_file(struct nfs4_file *f) static inline void put_nfs4_file(struct nfs4_file *fi) { - if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del(&fi->fi_hash); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); iput(fi->fi_inode); nfsd4_free_file(fi); } @@ -375,7 +375,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - dp->dl_stid.sc_type = NFS4_DELEG_STID; /* * delegation seqid's are never incremented. 
The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -418,6 +417,8 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_put_deleg_lease(struct nfs4_file *fp) { + if (!fp->fi_lease) + return; if (atomic_dec_and_test(&fp->fi_delegees)) { vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); fp->fi_lease = NULL; @@ -431,18 +432,30 @@ static void unhash_stid(struct nfs4_stid *s) s->sc_type = 0; } +static void +hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + spin_lock(&state_lock); list_del_init(&dp->dl_perclnt); - spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); - spin_unlock(&recall_lock); - nfs4_put_deleg_lease(dp->dl_file); - put_nfs4_file(dp->dl_file); - dp->dl_file = NULL; + spin_unlock(&state_lock); + if (dp->dl_file) { + nfs4_put_deleg_lease(dp->dl_file); + put_nfs4_file(dp->dl_file); + dp->dl_file = NULL; + } } @@ -645,6 +658,12 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) } } +static void nfs4_free_lockowner(struct nfs4_lockowner *lo) +{ + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} + static void release_lockowner(struct nfs4_lockowner *lo) { unhash_lockowner(lo); @@ -699,6 +718,12 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo) } } +static void nfs4_free_openowner(struct nfs4_openowner *oo) +{ + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); +} + static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); @@ -1093,7 +1118,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return clp; } -static inline void +static void free_client(struct nfs4_client *clp) { struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); @@ -1136,13 +1161,13 @@ destroy_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); @@ -1544,6 +1569,7 @@ out_err: void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { + struct xdr_buf *buf = resp->xdr.buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; @@ -1557,11 +1583,9 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) slot->sl_datalen = 0; return; } - slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; - base = (char *)resp->cstate.datap - - (char *)resp->xbuf->head[0].iov_base; - if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, - slot->sl_datalen)) + base = resp->cstate.data_offset; + slot->sl_datalen = buf->len - base; + if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) WARN("%s: sessions DRC could not cache compound\n", __func__); return; } @@ -1602,6 +1626,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct 
nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; __be32 status; dprintk("--> %s slot %p\n", __func__, slot); @@ -1610,14 +1636,16 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, if (status) return status; - /* The sequence operation has been encoded, cstate->datap set. */ - memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); + p = xdr_reserve_space(xdr, slot->sl_datalen); + if (!p) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); + xdr_commit_encode(xdr); resp->opcnt = slot->sl_opcnt; - resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); - status = slot->sl_status; - - return status; + return slot->sl_status; } /* @@ -2189,11 +2217,13 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_sequence *seq) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; + int buflen; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (resp->opcnt != 1) @@ -2262,6 +2292,16 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status) goto out_put_session; + buflen = (seq->cachethis) ? + session->se_fchannel.maxresp_cached : + session->se_fchannel.maxresp_sz; + status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : + nfserr_rep_too_big; + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) + goto out_put_session; + svc_reserve(rqstp, buflen); + + status = nfs_ok; /* Success! bump slot seqid */ slot->sl_seqid = seq->seqid; slot->sl_flags |= NFSD4_SLOT_INUSE; @@ -2499,28 +2539,19 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); -} - -static void -nfsd4_free_slab(struct kmem_cache **slab) -{ - if (*slab == NULL) - return; - kmem_cache_destroy(*slab); - *slab = NULL; + spin_unlock(&state_lock); } void nfsd4_free_slabs(void) { - nfsd4_free_slab(&openowner_slab); - nfsd4_free_slab(&lockowner_slab); - nfsd4_free_slab(&file_slab); - nfsd4_free_slab(&stateid_slab); - nfsd4_free_slab(&deleg_slab); + kmem_cache_destroy(openowner_slab); + kmem_cache_destroy(lockowner_slab); + kmem_cache_destroy(file_slab); + kmem_cache_destroy(stateid_slab); + kmem_cache_destroy(deleg_slab); } int @@ -2529,42 +2560,38 @@ nfsd4_init_slabs(void) openowner_slab = kmem_cache_create("nfsd4_openowners", sizeof(struct nfs4_openowner), 0, 0, NULL); if (openowner_slab == NULL) - goto out_nomem; + goto out; lockowner_slab = kmem_cache_create("nfsd4_lockowners", sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) - goto out_nomem; + goto out_free_openowner_slab; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) - goto out_nomem; + goto out_free_lockowner_slab; stateid_slab = kmem_cache_create("nfsd4_stateids", sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) - goto out_nomem; + goto out_free_file_slab; deleg_slab = kmem_cache_create("nfsd4_delegations", sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) - goto out_nomem; + goto out_free_stateid_slab; return 0; -out_nomem: - nfsd4_free_slabs(); + +out_free_stateid_slab: + 
kmem_cache_destroy(stateid_slab); +out_free_file_slab: + kmem_cache_destroy(file_slab); +out_free_lockowner_slab: + kmem_cache_destroy(lockowner_slab); +out_free_openowner_slab: + kmem_cache_destroy(openowner_slab); +out: dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; } -void nfs4_free_openowner(struct nfs4_openowner *oo) -{ - kfree(oo->oo_owner.so_owner.data); - kmem_cache_free(openowner_slab, oo); -} - -void nfs4_free_lockowner(struct nfs4_lockowner *lo) -{ - kfree(lo->lo_owner.so_owner.data); - kmem_cache_free(lockowner_slab, lo); -} - static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; @@ -2685,15 +2712,15 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return fp; } } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return NULL; } @@ -2730,6 +2757,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&state_lock); /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2766,11 +2794,11 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); fp->fi_had_conflict = true; list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); } static @@ -3047,11 +3075,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) goto out_free; - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); - list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_lock(&state_lock); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; out_free: locks_free_lock(fl); @@ -3060,33 +3089,21 @@ out_free: static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) { - int status; - if (fp->fi_had_conflict) return -EAGAIN; get_nfs4_file(fp); dp->dl_file = fp; - if (!fp->fi_lease) { - status = nfs4_setlease(dp); - if (status) - goto out_free; - return 0; - } - spin_lock(&recall_lock); + if (!fp->fi_lease) + return nfs4_setlease(dp); + spin_lock(&state_lock); + atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { - spin_unlock(&recall_lock); - status = -EAGAIN; - goto out_free; + spin_unlock(&state_lock); + return -EAGAIN; } - atomic_inc(&fp->fi_delegees); - list_add(&dp->dl_perfile, &fp->fi_delegations); - spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; -out_free: - put_nfs4_file(fp); - dp->dl_file = fp; - return status; } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -3173,8 +3190,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; return; out_free: - remove_stid(&dp->dl_stid); - nfs4_put_delegation(dp); + destroy_delegation(dp); 
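/*
 * Editor's illustration, not part of the patch: the nfsd4_init_slabs()
 * hunk above drops the NULL-checking nfsd4_free_slab() helper in favor of
 * the standard kernel goto-unwind idiom, where each failure label releases
 * exactly what was created before it, in reverse order, so the error path
 * never passes a half-initialized cache to kmem_cache_destroy(). A
 * minimal, self-contained sketch of the idiom (all names hypothetical):
 */
#include <linux/slab.h>

struct foo_obj { int data; };
struct bar_obj { int data; };

static struct kmem_cache *foo_slab;
static struct kmem_cache *bar_slab;

static int example_init_slabs(void)
{
	foo_slab = kmem_cache_create("example_foo", sizeof(struct foo_obj),
				     0, 0, NULL);
	if (foo_slab == NULL)
		goto out;
	bar_slab = kmem_cache_create("example_bar", sizeof(struct bar_obj),
				     0, 0, NULL);
	if (bar_slab == NULL)
		goto out_free_foo;
	return 0;

out_free_foo:
	/* Unwind in reverse order of creation; nothing here can be NULL. */
	kmem_cache_destroy(foo_slab);
out:
	return -ENOMEM;
}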
out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && @@ -3391,8 +3407,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; - time_t t, clientid_val = nn->nfsd4_lease; - time_t u, test_val = nn->nfsd4_lease; + time_t t, new_timeo = nn->nfsd4_lease; nfs4_lock_state(); @@ -3404,8 +3419,7 @@ nfs4_laundromat(struct nfsd_net *nn) clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { t = clp->cl_time - cutoff; - if (clientid_val > t) - clientid_val = t; + new_timeo = min(new_timeo, t); break; } if (mark_client_expired_locked(clp)) { @@ -3422,39 +3436,35 @@ nfs4_laundromat(struct nfsd_net *nn) clp->cl_clientid.cl_id); expire_client(clp); } - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { - u = dp->dl_time - cutoff; - if (test_val > u) - test_val = u; + t = dp->dl_time - cutoff; + new_timeo = min(new_timeo, t); break; } list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); revoke_delegation(dp); } - test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { oo = container_of(pos, struct nfs4_openowner, oo_close_lru); if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { - u = oo->oo_time - cutoff; - if (test_val > u) - test_val = u; + t = oo->oo_time - cutoff; + new_timeo = min(new_timeo, t); break; } release_openowner(oo); } - if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) - clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); nfs4_unlock_state(); - return clientid_val; + return new_timeo; } static struct workqueue_struct *laundry_wq; @@ -3654,6 +3664,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct file *file = NULL; __be32 status; if (filpp) @@ -3665,10 +3676,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); + nfs4_lock_state(); + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion, nn); if (status) - return status; + goto out; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3679,8 +3692,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { - *filpp = dp->dl_file->fi_deleg_file; - if (!*filpp) { + file = dp->dl_file->fi_deleg_file; + if (!file) { WARN_ON_ONCE(1); status = nfserr_serverfault; goto out; @@ -3701,16 +3714,20 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, goto out; if (filpp) { if (flags & RD_STATE) - *filpp = find_readable_file(stp->st_file); + file = find_readable_file(stp->st_file); 
else - *filpp = find_writeable_file(stp->st_file); + file = find_writeable_file(stp->st_file); } break; default: - return nfserr_bad_stateid; + status = nfserr_bad_stateid; + goto out; } status = nfs_ok; + if (file) + *filpp = get_file(file); out: + nfs4_unlock_state(); return status; } @@ -3726,7 +3743,7 @@ nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) * correspondence, and we have to delete the lockowner when we * delete the lock stateid: */ - unhash_lockowner(lo); + release_lockowner(lo); return nfs_ok; } @@ -4896,6 +4913,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, struct nfs4_delegation *dp, *next; u64 count = 0; + lockdep_assert_held(&state_lock); list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { if (victims) list_move(&dp->dl_recall_lru, victims); @@ -4911,9 +4929,9 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) revoke_delegation(dp); @@ -4927,11 +4945,11 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return count; } @@ -4940,9 +4958,9 @@ u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) { u64 count = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, NULL); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); nfsd_print_count(clp, count, "delegations"); return count; @@ -4983,13 +5001,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_ #endif /* CONFIG_NFSD_FAULT_INJECTION */ -/* initialization to perform at module load time: */ - -void -nfs4_state_init(void) -{ -} - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -5160,12 +5171,12 @@ nfs4_state_shutdown_net(struct net *net) nfs4_lock_state(); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 18881f34737a..2d305a121f37 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -98,11 +98,6 @@ xdr_error: \ status = nfserr_bad_xdr; \ goto out -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ } while (0) @@ -248,17 +243,17 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[2] = 0; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); if (bmlen > 1000) goto xdr_error; READ_BUF(bmlen << 2); if (bmlen > 0) - READ32(bmval[0]); + bmval[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bmval[1]); + bmval[1]
= be32_to_cpup(p++); if (bmlen > 2) - READ32(bmval[2]); + bmval[2] = be32_to_cpup(p++); DECODE_TAIL; } @@ -270,6 +265,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, { int expected_len, len = 0; u32 dummy32; + u64 sec; char *buf; DECODE_HEAD; @@ -278,12 +274,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return status; READ_BUF(4); - READ32(expected_len); + expected_len = be32_to_cpup(p++); if (bmval[0] & FATTR4_WORD0_SIZE) { READ_BUF(8); len += 8; - READ64(iattr->ia_size); + p = xdr_decode_hyper(p, &iattr->ia_size); iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { @@ -291,7 +287,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct nfs4_ace *ace; READ_BUF(4); len += 4; - READ32(nace); + nace = be32_to_cpup(p++); if (nace > NFS4_ACL_MAX) return nfserr_fbig; @@ -305,10 +301,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, (*acl)->naces = nace; for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; - READ32(ace->type); - READ32(ace->flag); - READ32(ace->access_mask); - READ32(dummy32); + ace->type = be32_to_cpup(p++); + ace->flag = be32_to_cpup(p++); + ace->access_mask = be32_to_cpup(p++); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); @@ -330,14 +326,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_MODE) { READ_BUF(4); len += 4; - READ32(iattr->ia_mode); + iattr->ia_mode = be32_to_cpup(p++); iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -348,7 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -359,15 +355,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ64(iattr->ia_atime.tv_sec); - READ32(iattr->ia_atime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_atime.tv_sec = (time_t)sec; + iattr->ia_atime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_atime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -382,15 +379,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. 
*/ READ_BUF(12); len += 12; - READ64(iattr->ia_mtime.tv_sec); - READ32(iattr->ia_mtime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_mtime.tv_sec = sec; + iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -408,13 +406,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { READ_BUF(4); len += 4; - READ32(dummy32); /* lfs: we don't use it */ + dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ READ_BUF(4); len += 4; - READ32(dummy32); /* pi: we don't use it either */ + dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) return nfserr_badlabel; @@ -445,7 +443,7 @@ nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(sid->si_generation); + sid->si_generation = be32_to_cpup(p++); COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -457,7 +455,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_HEAD; READ_BUF(4); - READ32(access->ac_req_access); + access->ac_req_access = be32_to_cpup(p++); DECODE_TAIL; } @@ -472,7 +470,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* callback_sec_params4 */ READ_BUF(4); - READ32(nr_secflavs); + nr_secflavs = be32_to_cpup(p++); if (nr_secflavs) cbs->flavor = (u32)(-1); else @@ -480,7 +478,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ cbs->flavor = 0; for (i = 0; i < nr_secflavs; ++i) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); switch (dummy) { case RPC_AUTH_NULL: /* Nothing to read */ @@ -490,21 +488,21 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ case RPC_AUTH_UNIX: READ_BUF(8); /* stamp */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* machine name */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); SAVEMEM(machine_name, dummy); /* uid, gid */ READ_BUF(8); - READ32(uid); - READ32(gid); + uid = be32_to_cpup(p++); + gid = be32_to_cpup(p++); /* more gids */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); if (cbs->flavor == (u32)(-1)) { kuid_t kuid = make_kuid(&init_user_ns, uid); @@ -524,14 +522,14 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ "not supported!\n"); READ_BUF(8); /* gcbp_service */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* gcbp_handle_from_server */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* gcbp_handle_from_client */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); break; default: @@ -547,7 +545,7 @@ static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, stru DECODE_HEAD; READ_BUF(4); - READ32(bc->bc_cb_program); + bc->bc_cb_program = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); DECODE_TAIL; @@ -559,7 +557,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(bcts->dir); + bcts->dir = be32_to_cpup(p++); /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker * could help us figure out if we should be using it.
*/ DECODE_TAIL; @@ -571,7 +569,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; READ_BUF(4); - READ32(close->cl_seqid); + close->cl_seqid = be32_to_cpup(p++); return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; @@ -584,8 +582,8 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit DECODE_HEAD; READ_BUF(12); - READ64(commit->co_offset); - READ32(commit->co_count); + p = xdr_decode_hyper(p, &commit->co_offset); + commit->co_count = be32_to_cpup(p++); DECODE_TAIL; } @@ -596,19 +594,19 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create DECODE_HEAD; READ_BUF(4); - READ32(create->cr_type); + create->cr_type = be32_to_cpup(p++); switch (create->cr_type) { case NF4LNK: READ_BUF(4); - READ32(create->cr_linklen); + create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); SAVEMEM(create->cr_linkname, create->cr_linklen); break; case NF4BLK: case NF4CHR: READ_BUF(8); - READ32(create->cr_specdata1); - READ32(create->cr_specdata2); + create->cr_specdata1 = be32_to_cpup(p++); + create->cr_specdata2 = be32_to_cpup(p++); break; case NF4SOCK: case NF4FIFO: @@ -618,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } READ_BUF(4); - READ32(create->cr_namelen); + create->cr_namelen = be32_to_cpup(p++); READ_BUF(create->cr_namelen); SAVEMEM(create->cr_name, create->cr_namelen); if ((status = check_filename(create->cr_name, create->cr_namelen))) @@ -650,7 +648,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) DECODE_HEAD; READ_BUF(4); - READ32(link->li_namelen); + link->li_namelen = be32_to_cpup(p++); READ_BUF(link->li_namelen); SAVEMEM(link->li_name, link->li_namelen); if ((status = check_filename(link->li_name, link->li_namelen))) @@ -668,24 +666,24 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ READ_BUF(28); - READ32(lock->lk_type); + lock->lk_type = be32_to_cpup(p++); if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(lock->lk_reclaim); - READ64(lock->lk_offset); - READ64(lock->lk_length); - READ32(lock->lk_is_new); + lock->lk_reclaim = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lock->lk_offset); + p = xdr_decode_hyper(p, &lock->lk_length); + lock->lk_is_new = be32_to_cpup(p++); if (lock->lk_is_new) { READ_BUF(4); - READ32(lock->lk_new_open_seqid); + lock->lk_new_open_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); if (status) return status; READ_BUF(8 + sizeof(clientid_t)); - READ32(lock->lk_new_lock_seqid); + lock->lk_new_lock_seqid = be32_to_cpup(p++); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - READ32(lock->lk_new_owner.len); + lock->lk_new_owner.len = be32_to_cpup(p++); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { @@ -693,7 +691,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) if (status) return status; READ_BUF(4); - READ32(lock->lk_old_lock_seqid); + lock->lk_old_lock_seqid = be32_to_cpup(p++); } DECODE_TAIL; @@ -705,13 +703,13 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) DECODE_HEAD; READ_BUF(32); - READ32(lockt->lt_type); + lockt->lt_type = be32_to_cpup(p++); if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) goto xdr_error; - 
READ64(lockt->lt_offset); - READ64(lockt->lt_length); + p = xdr_decode_hyper(p, &lockt->lt_offset); + p = xdr_decode_hyper(p, &lockt->lt_length); COPYMEM(&lockt->lt_clientid, 8); - READ32(lockt->lt_owner.len); + lockt->lt_owner.len = be32_to_cpup(p++); READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); @@ -724,16 +722,16 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; READ_BUF(8); - READ32(locku->lu_type); + locku->lu_type = be32_to_cpup(p++); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(locku->lu_seqid); + locku->lu_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &locku->lu_stateid); if (status) return status; READ_BUF(16); - READ64(locku->lu_offset); - READ64(locku->lu_length); + p = xdr_decode_hyper(p, &locku->lu_offset); + p = xdr_decode_hyper(p, &locku->lu_length); DECODE_TAIL; } @@ -744,7 +742,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_HEAD; READ_BUF(4); - READ32(lookup->lo_len); + lookup->lo_len = be32_to_cpup(p++); READ_BUF(lookup->lo_len); SAVEMEM(lookup->lo_name, lookup->lo_len); if ((status = check_filename(lookup->lo_name, lookup->lo_len))) @@ -759,7 +757,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh u32 w; READ_BUF(4); - READ32(w); + w = be32_to_cpup(p++); *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -811,7 +809,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) __be32 *p; READ_BUF(4); - READ32(*x); + *x = be32_to_cpup(p++); /* Note: unlike access bits, deny bits may be zero. */ if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; @@ -825,7 +823,7 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne __be32 *p; READ_BUF(4); - READ32(o->len); + o->len = be32_to_cpup(p++); if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) return nfserr_bad_xdr; @@ -850,7 +848,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_xdr_error = 0; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); - READ32(open->op_seqid); + open->op_seqid = be32_to_cpup(p++); /* decode, yet ignore deleg_when until supported */ status = nfsd4_decode_share_access(argp, &open->op_share_access, &open->op_deleg_want, &dummy); @@ -865,13 +863,13 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if (status) goto xdr_error; READ_BUF(4); - READ32(open->op_create); + open->op_create = be32_to_cpup(p++); switch (open->op_create) { case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: READ_BUF(4); - READ32(open->op_createmode); + open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: @@ -904,12 +902,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) /* open_claim */ READ_BUF(4); - READ32(open->op_claim_type); + open->op_claim_type = be32_to_cpup(p++); switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_PREV: READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -917,14 +915,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) break; case
NFS4_OPEN_CLAIM_PREVIOUS: READ_BUF(4); - READ32(open->op_delegate_type); + open->op_delegate_type = be32_to_cpup(p++); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); if (status) return status; READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -962,7 +960,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con if (status) return status; READ_BUF(4); - READ32(open_conf->oc_seqid); + open_conf->oc_seqid = be32_to_cpup(p++); DECODE_TAIL; } @@ -976,7 +974,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d if (status) return status; READ_BUF(4); - READ32(open_down->od_seqid); + open_down->od_seqid = be32_to_cpup(p++); status = nfsd4_decode_share_access(argp, &open_down->od_share_access, &open_down->od_deleg_want, NULL); if (status) @@ -993,7 +991,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) DECODE_HEAD; READ_BUF(4); - READ32(putfh->pf_fhlen); + putfh->pf_fhlen = be32_to_cpup(p++); if (putfh->pf_fhlen > NFS4_FHSIZE) goto xdr_error; READ_BUF(putfh->pf_fhlen); @@ -1019,8 +1017,8 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) if (status) return status; READ_BUF(12); - READ64(read->rd_offset); - READ32(read->rd_length); + p = xdr_decode_hyper(p, &read->rd_offset); + read->rd_length = be32_to_cpup(p++); DECODE_TAIL; } @@ -1031,10 +1029,10 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read DECODE_HEAD; READ_BUF(24); - READ64(readdir->rd_cookie); + p = xdr_decode_hyper(p, &readdir->rd_cookie); COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - READ32(readdir->rd_dircount); /* just in case you needed a useless field... 
*/ - READ32(readdir->rd_maxcount); + readdir->rd_dircount = be32_to_cpup(p++); + readdir->rd_maxcount = be32_to_cpup(p++); if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) goto out; @@ -1047,7 +1045,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove DECODE_HEAD; READ_BUF(4); - READ32(remove->rm_namelen); + remove->rm_namelen = be32_to_cpup(p++); READ_BUF(remove->rm_namelen); SAVEMEM(remove->rm_name, remove->rm_namelen); if ((status = check_filename(remove->rm_name, remove->rm_namelen))) @@ -1062,10 +1060,10 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename DECODE_HEAD; READ_BUF(4); - READ32(rename->rn_snamelen); + rename->rn_snamelen = be32_to_cpup(p++); READ_BUF(rename->rn_snamelen + 4); SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ32(rename->rn_tnamelen); + rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) @@ -1097,7 +1095,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(secinfo->si_namelen); + secinfo->si_namelen = be32_to_cpup(p++); READ_BUF(secinfo->si_namelen); SAVEMEM(secinfo->si_name, secinfo->si_namelen); status = check_filename(secinfo->si_name, secinfo->si_namelen); @@ -1113,7 +1111,7 @@ nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(sin->sin_style); + sin->sin_style = be32_to_cpup(p++); DECODE_TAIL; } @@ -1144,16 +1142,16 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient if (status) return nfserr_bad_xdr; READ_BUF(8); - READ32(setclientid->se_callback_prog); - READ32(setclientid->se_callback_netid_len); + setclientid->se_callback_prog = be32_to_cpup(p++); + setclientid->se_callback_netid_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_netid_len + 4); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ32(setclientid->se_callback_addr_len); + setclientid->se_callback_addr_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_addr_len + 4); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ32(setclientid->se_callback_ident); + setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; } @@ -1186,7 +1184,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify * nfsd4_proc_verify */ READ_BUF(4); - READ32(verify->ve_attrlen); + verify->ve_attrlen = be32_to_cpup(p++); READ_BUF(verify->ve_attrlen); SAVEMEM(verify->ve_attrval, verify->ve_attrlen); @@ -1204,11 +1202,11 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) if (status) return status; READ_BUF(16); - READ64(write->wr_offset); - READ32(write->wr_stable_how); + p = xdr_decode_hyper(p, &write->wr_offset); + write->wr_stable_how = be32_to_cpup(p++); if (write->wr_stable_how > 2) goto xdr_error; - READ32(write->wr_buflen); + write->wr_buflen = be32_to_cpup(p++); /* Sorry .. no magic macros for this.. 
* * READ_BUF(write->wr_buflen); @@ -1254,7 +1252,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel READ_BUF(12); COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - READ32(rlockowner->rl_owner.len); + rlockowner->rl_owner.len = be32_to_cpup(p++); READ_BUF(rlockowner->rl_owner.len); READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); @@ -1278,63 +1276,63 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, return nfserr_bad_xdr; READ_BUF(4); - READ32(exid->flags); + exid->flags = be32_to_cpup(p++); /* Ignore state_protect4_a */ READ_BUF(4); - READ32(exid->spa_how); + exid->spa_how = be32_to_cpup(p++); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: /* spo_must_enforce */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* spo_must_allow */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; break; case SP4_SSV: /* ssp_ops */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* ssp_hash_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); - READ32(dummy); - READ32(dummy); + dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p++); break; default: goto xdr_error; @@ -1342,7 +1340,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* Ignore Implementation ID */ READ_BUF(4); /* nfs_impl_id4 array length */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy > 1) goto xdr_error; @@ -1350,13 +1348,13 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, if (dummy == 1) { /* nii_domain */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* nii_name */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); @@ -1376,21 +1374,21 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(16); COPYMEM(&sess->clientid, 8); - READ32(sess->seqid); - READ32(sess->flags); + sess->seqid = be32_to_cpup(p++); + sess->flags = be32_to_cpup(p++); /* Fore channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->fore_channel.maxreq_sz); - READ32(sess->fore_channel.maxresp_sz); - READ32(sess->fore_channel.maxresp_cached); - READ32(sess->fore_channel.maxops); - READ32(sess->fore_channel.maxreqs); - READ32(sess->fore_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->fore_channel.maxreq_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_cached = be32_to_cpup(p++); + sess->fore_channel.maxops = be32_to_cpup(p++); + sess->fore_channel.maxreqs = be32_to_cpup(p++); + sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->fore_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->fore_channel.rdma_attrs); + sess->fore_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->fore_channel.nr_rdma_attrs > 1) { dprintk("Too many fore channel attr bitmaps!\n"); goto xdr_error; @@ -1398,23 
+1396,23 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->back_channel.maxreq_sz); - READ32(sess->back_channel.maxresp_sz); - READ32(sess->back_channel.maxresp_cached); - READ32(sess->back_channel.maxops); - READ32(sess->back_channel.maxreqs); - READ32(sess->back_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->back_channel.maxreq_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_cached = be32_to_cpup(p++); + sess->back_channel.maxops = be32_to_cpup(p++); + sess->back_channel.maxreqs = be32_to_cpup(p++); + sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->back_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->back_channel.rdma_attrs); + sess->back_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->back_channel.nr_rdma_attrs > 1) { dprintk("Too many back channel attr bitmaps!\n"); goto xdr_error; } READ_BUF(4); - READ32(sess->callback_prog); + sess->callback_prog = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &sess->cb_sec); DECODE_TAIL; } @@ -1437,7 +1435,7 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(free_stateid->fr_stateid.si_generation); + free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -1451,10 +1449,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(seq->seqid); - READ32(seq->slotid); - READ32(seq->maxslots); - READ32(seq->cachethis); + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p++); DECODE_TAIL; } @@ -1511,7 +1509,7 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_HEAD; READ_BUF(4); - READ32(rc->rca_one_fs); + rc->rca_one_fs = be32_to_cpup(p++); DECODE_TAIL; } @@ -1605,47 +1603,25 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) return true; } -/* - * Return a rough estimate of the maximum possible reply size. Note the - * estimate includes rpc headers so is meant to be passed to - * svc_reserve, not svc_reserve_auth. - * - * Also note the current compound encoding permits only one operation to - * use pages beyond the first one, so the maximum possible length is the - * maximum over these values, not the sum. 
- */ -static int nfsd4_max_reply(u32 opnum) -{ - switch (opnum) { - case OP_READLINK: - case OP_READDIR: - /* - * Both of these ops take a single page for data and put - * the head and tail in another page: - */ - return 2 * PAGE_SIZE; - case OP_READ: - return INT_MAX; - default: - return PAGE_SIZE; - } -} - static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; bool cachethis = false; - int max_reply = PAGE_SIZE; + int auth_slack = argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ + int readcount = 0; + int readbytes = 0; int i; READ_BUF(4); - READ32(argp->taglen); + argp->taglen = be32_to_cpup(p++); READ_BUF(argp->taglen + 8); SAVEMEM(argp->tag, argp->taglen); - READ32(argp->minorversion); - READ32(argp->opcnt); + argp->minorversion = be32_to_cpup(p++); + argp->opcnt = be32_to_cpup(p++); + max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); if (argp->taglen > NFSD4_MAX_TAGLEN) goto xdr_error; @@ -1669,7 +1645,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->replay = NULL; READ_BUF(4); - READ32(op->opnum); + op->opnum = be32_to_cpup(p++); if (nfsd4_opnum_in_range(argp, op)) op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); @@ -1677,97 +1653,82 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } - - if (op->status) { - argp->opcnt = i+1; - break; - } /* * We'll try to cache the result in the DRC if any one * op in the compound wants to be cached: */ cachethis |= nfsd4_cache_this_op(op); - max_reply = max(max_reply, nfsd4_max_reply(op->opnum)); + if (op->opnum == OP_READ) { + readcount++; + readbytes += nfsd4_max_reply(argp->rqstp, op); + } else + max_reply += nfsd4_max_reply(argp->rqstp, op); + + if (op->status) { + argp->opcnt = i+1; + break; + } } /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - if (max_reply != INT_MAX) - svc_reserve(argp->rqstp, max_reply); + svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ?
RC_REPLBUFF : RC_NOCACHE; - DECODE_TAIL; -} - -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((u32)((n) >> 32)); \ - *p++ = htonl((u32)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ - *(p + XDR_QUADLEN(nbytes) -1) = 0; \ - memcpy(p, ptr, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -}} while (0) - -static void write32(__be32 **p, u32 n) -{ - *(*p)++ = htonl(n); -} + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) + argp->rqstp->rq_splice_ok = false; -static void write64(__be32 **p, u64 n) -{ - write32(p, (n >> 32)); - write32(p, (u32)n); + DECODE_TAIL; } -static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) { if (IS_I_VERSION(inode)) { - write64(p, inode->i_version); + p = xdr_encode_hyper(p, inode->i_version); } else { - write32(p, stat->ctime.tv_sec); - write32(p, stat->ctime.tv_nsec); + *p++ = cpu_to_be32(stat->ctime.tv_sec); + *p++ = cpu_to_be32(stat->ctime.tv_nsec); } + return p; } -static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) { - write32(p, c->atomic); + *p++ = cpu_to_be32(c->atomic); if (c->change_supported) { - write64(p, c->before_change); - write64(p, c->after_change); + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); } else { - write32(p, c->before_ctime_sec); - write32(p, c->before_ctime_nsec); - write32(p, c->after_ctime_sec); - write32(p, c->after_ctime_nsec); + *p++ = cpu_to_be32(c->before_ctime_sec); + *p++ = cpu_to_be32(c->before_ctime_nsec); + *p++ = cpu_to_be32(c->after_ctime_sec); + *p++ = cpu_to_be32(c->after_ctime_nsec); } + return p; } -#define RESERVE_SPACE(nbytes) do { \ - p = resp->p; \ - BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \ -} while (0) -#define ADJUST_ARGS() resp->p = p - /* Encode as an array of strings the string given with components * separated @sep, escaped with esc_enter and esc_exit. */ -static __be32 nfsd4_encode_components_esc(char sep, char *components, - __be32 **pp, int *buflen, - char esc_enter, char esc_exit) +static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, + char *components, char esc_enter, + char esc_exit) { - __be32 *p = *pp; - __be32 *countp = p; + __be32 *p; + __be32 pathlen; + int pathlen_offset; int strlen, count=0; char *str, *end, *next; dprintk("nfsd4_encode_components(%s)\n", components); - if ((*buflen -= 4) < 0) + + pathlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(0); /* We will fill this in with @count later */ + p++; /* We will fill this in with @count later */ + end = str = components; while (*end) { bool found_esc = false; @@ -1789,59 +1750,57 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components, strlen = end - str; if (strlen) { - if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + p = xdr_reserve_space(xdr, strlen + 4); + if (!p) return nfserr_resource; - WRITE32(strlen); - WRITEMEM(str, strlen); + p = xdr_encode_opaque(p, str, strlen); count++; } else end++; str = end; } - *pp = p; - p = countp; - WRITE32(count); + pathlen = htonl(xdr->buf->len - pathlen_offset); + write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; } /* Encode as an array of strings the string given with components * separated @sep. 
*/ -static __be32 nfsd4_encode_components(char sep, char *components, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, + char *components) { - return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0); + return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } /* * encode a location element of a fs_locations structure */ -static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, + struct nfsd4_fs_location *location) { __be32 status; - __be32 *p = *pp; - status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen, + status = nfsd4_encode_components_esc(xdr, ':', location->hosts, '[', ']'); if (status) return status; - status = nfsd4_encode_components('/', location->path, &p, buflen); + status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - *pp = p; return 0; } /* * Encode a path in RFC3530 'pathname4' format */ -static __be32 nfsd4_encode_path(const struct path *root, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_path(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; - __be32 *p = *pp; + __be32 *p; struct dentry **components = NULL; unsigned int ncomponents = 0; __be32 err = nfserr_jukebox; @@ -1872,11 +1831,11 @@ static __be32 nfsd4_encode_path(const struct path *root, components[ncomponents++] = cur.dentry; cur.dentry = dget_parent(cur.dentry); } - - *buflen -= 4; - if (*buflen < 0) + err = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_free; - WRITE32(ncomponents); + *p++ = cpu_to_be32(ncomponents); while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; @@ -1884,20 +1843,18 @@ static __be32 nfsd4_encode_path(const struct path *root, spin_lock(&dentry->d_lock); len = dentry->d_name.len; - *buflen -= 4 + (XDR_QUADLEN(len) << 2); - if (*buflen < 0) { + p = xdr_reserve_space(xdr, len + 4); + if (!p) { spin_unlock(&dentry->d_lock); goto out_free; } - WRITE32(len); - WRITEMEM(dentry->d_name.name, len); + p = xdr_encode_opaque(p, dentry->d_name.name, len); dprintk("/%s", dentry->d_name.name); spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; } - *pp = p; err = 0; out_free: dprintk(")\n"); @@ -1908,8 +1865,8 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, + struct svc_rqst *rqstp, const struct path *path) { struct svc_export *exp_ps; __be32 res; @@ -1917,7 +1874,7 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); exp_put(exp_ps); return res; } @@ -1925,28 +1882,26 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, /* * encode a fs_locations structure */ -static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, - struct svc_export *exp, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, + struct svc_rqst *rqstp, struct svc_export *exp) { __be32 status; int i; - __be32 *p = *pp; + __be32 *p; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - status = 
nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); if (status) return status; - if ((*buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(fslocs->locations_count); + *p++ = cpu_to_be32(fslocs->locations_count); for (i=0; i<fslocs->locations_count; i++) { - status = nfsd4_encode_fs_location4(&fslocs->locations[i], - &p, buflen); + status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); if (status) return status; } - *pp = p; return 0; } @@ -1965,15 +1920,15 @@ static u32 nfs4_file_type(umode_t mode) } static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, - __be32 **p, int *buflen) +nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { if (ace->whotype != NFS4_ACL_WHO_NAMED) - return nfs4_acl_write_who(ace->whotype, p, buflen); + return nfs4_acl_write_who(xdr, ace->whotype); else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen); + return nfsd4_encode_group(xdr, rqstp, ace->who_gid); else - return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen); + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -1982,31 +1937,28 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { - __be32 *p = *pp; + __be32 *p; - if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4)) + p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + if (!p) return nfserr_resource; /* * For now we use a 0 here to indicate the null translation; in * the future we may place a call to translation code here. */ - if ((*buflen -= 8) < 0) - return nfserr_resource; - - WRITE32(0); /* lfs */ - WRITE32(0); /* pi */ + *p++ = cpu_to_be32(0); /* lfs */ + *p++ = cpu_to_be32(0); /* pi */ p = xdr_encode_opaque(p, context, len); - *buflen -= (XDR_QUADLEN(len) << 2) + 4; - - *pp = p; return 0; } #else static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { return 0; } #endif @@ -2045,12 +1997,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. 
- * - * countp is the buffer size in _words_ */ -__be32 -nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, int count, u32 *bmval, +static __be32 +nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, + struct svc_export *exp, + struct dentry *dentry, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; @@ -2059,12 +2010,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct kstat stat; struct svc_fh *tempfh = NULL; struct kstatfs statfs; - int buflen = count << 2; - __be32 *attrlenp; + __be32 *p; + int starting_len = xdr->buf->len; + int attrlen_offset; + __be32 attrlen; u32 dummy; u64 dummy64; u32 rdattr_err = 0; - __be32 *p = *buffer; __be32 status; int err; int aclsupport = 0; @@ -2095,8 +2047,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, err = vfs_getattr(&path, &stat); if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | - FATTR4_WORD0_MAXNAME)) || + if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &statfs); @@ -2145,25 +2097,33 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ if (bmval2) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(bmval0); - WRITE32(bmval1); - WRITE32(bmval2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); } else { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32(1); - WRITE32(bmval0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); } - attrlenp = p++; /* to be backfilled later */ + + attrlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { u32 word0 = nfsd_suppattrs0(minorversion); @@ -2175,296 +2135,343 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (!contextsupport) word2 &= ~FATTR4_WORD2_SECURITY_LABEL; if (!word2) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(word0); - WRITE32(word1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); } else { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(word0); - WRITE32(word1); - WRITE32(word2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(word2); } } if (bmval0 & FATTR4_WORD0_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) { status = nfserr_serverfault; goto out; } - WRITE32(dummy); + *p++ = cpu_to_be32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - 
WRITE32(NFS4_FH_PERSISTENT); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); else - WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| + NFS4_FH_VOL_RENAME); } if (bmval0 & FATTR4_WORD0_CHANGE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - write_change(&p, &stat, dentry->d_inode); + p = encode_change(p, &stat, dentry->d_inode); } if (bmval0 & FATTR4_WORD0_SIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.size); + p = xdr_encode_hyper(p, stat.size); } if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_FSID) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; if (exp->ex_fslocs.migrated) { - WRITE64(NFS4_REFERRAL_FSID_MAJOR); - WRITE64(NFS4_REFERRAL_FSID_MINOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); } else switch(fsid_source(fhp)) { case FSIDSOURCE_FSID: - WRITE64((u64)exp->ex_fsid); - WRITE64((u64)0); + p = xdr_encode_hyper(p, (u64)exp->ex_fsid); + p = xdr_encode_hyper(p, (u64)0); break; case FSIDSOURCE_DEV: - WRITE32(0); - WRITE32(MAJOR(stat.dev)); - WRITE32(0); - WRITE32(MINOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MAJOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MINOR(stat.dev)); break; case FSIDSOURCE_UUID: - WRITEMEM(exp->ex_uuid, 16); + p = xdr_encode_opaque_fixed(p, exp->ex_uuid, + EX_UUID_LEN); break; } } if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(nn->nfsd4_lease); + *p++ = cpu_to_be32(nn->nfsd4_lease); } if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(rdattr_err); + *p++ = cpu_to_be32(rdattr_err); } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; if (acl == NULL) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); goto out_acl; } - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(acl->naces); + *p++ = cpu_to_be32(acl->naces); for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - if ((buflen -= 4*3) < 0) + p = xdr_reserve_space(xdr, 4*3); + if (!p) goto out_resource; - WRITE32(ace->type); - WRITE32(ace->flag); - WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); + *p++ = cpu_to_be32(ace->type); + *p++ = cpu_to_be32(ace->flag); + *p++ = cpu_to_be32(ace->access_mask & + NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(xdr, rqstp, ace); if (status) goto out; } } out_acl: if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; 
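[Every attribute branch in this function is converted the same way: the old open-coded bookkeeping, decrement buflen and test against zero, becomes a call to xdr_reserve_space(), which returns a write cursor or NULL when the stream is out of room, and each WRITE32()/WRITE64() becomes an explicit cpu_to_be32() or xdr_encode_hyper() store through that cursor. A rough userspace model of the reservation idiom follows; struct xdr_stream is deliberately reduced to a cursor/limit pair, the names are illustrative rather than the kernel's, and htonl() stands in for cpu_to_be32():

    #include <arpa/inet.h>  /* htonl() stands in for cpu_to_be32() */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Reduced stand-in for struct xdr_stream: a cursor and a limit. */
    struct xdr_model {
            uint32_t *p;    /* next free word */
            uint32_t *end;  /* one past the last usable word */
    };

    /* Reserve nbytes (rounded up to whole XDR words) or fail with NULL. */
    static uint32_t *reserve_space(struct xdr_model *xdr, size_t nbytes)
    {
            size_t words = (nbytes + 3) >> 2;
            uint32_t *p = xdr->p;

            if ((size_t)(xdr->end - xdr->p) < words)
                    return NULL;    /* callers map this to nfserr_resource */
            xdr->p += words;
            return p;
    }

    int main(void)
    {
            uint32_t buf[4];
            struct xdr_model xdr = { buf, buf + 4 };
            uint32_t *p;

            p = reserve_space(&xdr, 8);     /* two attribute words */
            if (!p)
                    return 1;
            *p++ = htonl(1);                /* e.g. a TYPE word */
            *p++ = htonl(0644);             /* e.g. a MODE word */

            /* A reservation that no longer fits fails cleanly: */
            printf("second reserve: %s\n",
                   reserve_space(&xdr, 12) ? "ok" : "out of space");
            return 0;
    }

On failure the branches above jump to out_resource, and nfsd4_encode_fattr() can then discard the partial attribute block in one step, because the stream records exactly how far encoding got.]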
- WRITE32(aclsupport ? + *p++ = cpu_to_be32(aclsupport ? ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); } if (bmval0 & FATTR4_WORD0_CANSETTIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4; - if (buflen < 0) + p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); + if (!p) goto out_resource; - WRITE32(fhp->fh_handle.fh_size); - WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size); + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, + fhp->fh_handle.fh_size); } if (bmval0 & FATTR4_WORD0_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_files); + p = xdr_encode_hyper(p, (u64) statfs.f_files); } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); + status = nfsd4_encode_fs_locations(xdr, rqstp, exp); if (status) goto out; } if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(exp->ex_path.mnt->mnt_sb->s_maxbytes); + p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); } if (bmval0 & FATTR4_WORD0_MAXLINK) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(255); + *p++ = cpu_to_be32(255); } if (bmval0 & FATTR4_WORD0_MAXNAME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(statfs.f_namelen); + *p++ = cpu_to_be32(statfs.f_namelen); } if (bmval0 & FATTR4_WORD0_MAXREAD) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval0 & FATTR4_WORD0_MAXWRITE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval1 & FATTR4_WORD1_MODE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.mode & 
S_IALLUGO); + *p++ = cpu_to_be32(stat.mode & S_IALLUGO); } if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval1 & FATTR4_WORD1_NUMLINKS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.nlink); + *p++ = cpu_to_be32(stat.nlink); } if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); + status = nfsd4_encode_user(xdr, rqstp, stat.uid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); + status = nfsd4_encode_group(xdr, rqstp, stat.gid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_RAWDEV) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32((u32) MAJOR(stat.rdev)); - WRITE32((u32) MINOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); } if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_USED) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)stat.blocks << 9; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.atime.tv_sec); - WRITE32(stat.atime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); + *p++ = cpu_to_be32(stat.atime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(0); - WRITE32(1); - WRITE32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(0); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.ctime.tv_sec); - WRITE32(stat.ctime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); + *p++ = cpu_to_be32(stat.ctime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.mtime.tv_sec); - WRITE32(stat.mtime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); + *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; /* * Get parent's attributes if not ignoring crossmount @@ -2473,25 +2480,26 @@ out_acl: if (ignore_crossmnt == 0 && dentry == exp->ex_path.mnt->mnt_root) get_parent_attributes(exp, &stat); - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = 
nfsd4_encode_security_label(rqstp, context, - contextlen, &p, &buflen); + status = nfsd4_encode_security_label(xdr, rqstp, context, + contextlen); if (status) goto out; } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); } - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); - *buffer = p; + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); status = nfs_ok; out: @@ -2504,6 +2512,8 @@ out: fh_put(tempfh); kfree(tempfh); } + if (status) + xdr_truncate_encode(xdr, starting_len); return status; out_nfserr: status = nfserrno(err); @@ -2513,6 +2523,37 @@ out_resource: goto out; } +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + struct xdr_buf dummy; + struct xdr_stream xdr; + __be32 ret; + + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); + ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, + ignore_crossmnt); + *p = xdr.p; + return ret; +} + static inline int attributes_need_mount(u32 *bmval) { if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) @@ -2523,8 +2564,8 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, - const char *name, int namlen, __be32 **p, int buflen) +nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, + const char *name, int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -2576,7 +2617,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, } out_encode: - nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, + nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, cd->rd_rqstp, ignore_crossmnt); out_put: dput(dentry); @@ -2585,9 +2626,12 @@ out_put: } static __be32 * -nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) +nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - if (buflen < 6) + __be32 *p; + + p = xdr_reserve_space(xdr, 6); + if (!p) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ @@ -2604,10 +2648,13 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, { struct readdir_cd *ccd = ccdv; struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); - int buflen; - __be32 *p = cd->buffer; - __be32 *cookiep; + struct xdr_stream *xdr = cd->xdr; + int start_offset = xdr->buf->len; + int cookie_offset; + int entry_bytes; __be32 nfserr = nfserr_toosmall; + __be64 wire_offset; + __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. 
*/ if (name && isdotent(name, namlen)) { @@ -2615,19 +2662,24 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->offset) - xdr_encode_hyper(cd->offset, (u64) offset); + if (cd->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, + &wire_offset, 8); + } - buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); - if (buflen < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto fail; - *p++ = xdr_one; /* mark entry present */ - cookiep = p; + cookie_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 3*4 + namlen); + if (!p) + goto fail; p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ - nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen); + nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -2646,59 +2698,74 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(p, buflen, nfserr); + p = nfsd4_encode_rdattr_error(xdr, nfserr); if (p == NULL) { nfserr = nfserr_toosmall; goto fail; } } - cd->buflen -= (p - cd->buffer); - cd->buffer = p; - cd->offset = cookiep; + nfserr = nfserr_toosmall; + entry_bytes = xdr->buf->len - start_offset; + if (entry_bytes > cd->rd_maxcount) + goto fail; + cd->rd_maxcount -= entry_bytes; + if (!cd->rd_dircount) + goto fail; + cd->rd_dircount--; + cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; return 0; fail: + xdr_truncate_encode(xdr, start_offset); cd->common.err = nfserr; return -EINVAL; } -static void -nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +static __be32 +nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(sid->si_generation); - WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, sizeof(stateid_t)); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sid->si_generation); + p = xdr_encode_opaque_fixed(p, &sid->si_opaque, + sizeof(stateid_opaque_t)); + return 0; } static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8); - WRITE32(access->ac_supported); - WRITE32(access->ac_resp_access); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); } return nfserr; } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); - WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(bcts->dir); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); /* Sorry, we do not yet support RDMA over 4.1: */ - WRITE32(0); - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -2706,8 +2773,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close 
*close) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &close->cl_stateid); + nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid); return nfserr; } @@ -2716,12 +2785,15 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -2729,15 +2801,17 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(32); - write_cinfo(&p, &create->cr_cinfo); - WRITE32(2); - WRITE32(create->cr_bmval[0]); - WRITE32(create->cr_bmval[1]); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &create->cr_cinfo); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(create->cr_bmval[0]); + *p++ = cpu_to_be32(create->cr_bmval[1]); } return nfserr; } @@ -2746,14 +2820,13 @@ static __be32 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) { struct svc_fh *fhp = getattr->ga_fhp; - int buflen; + struct xdr_stream *xdr = &resp->xdr; if (nfserr) return nfserr; - buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); - nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, - &resp->p, buflen, getattr->ga_bmval, + nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, resp->rqstp, 0); return nfserr; } @@ -2761,16 +2834,17 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct xdr_stream *xdr = &resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; __be32 *p; if (!nfserr) { len = fhp->fh_handle.fh_size; - RESERVE_SPACE(len + 4); - WRITE32(len); - WRITEMEM(&fhp->fh_handle.fh_base, len); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); } return nfserr; } @@ -2779,52 +2853,69 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh * Including all fields other than the name, a LOCK4denied structure requires * 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. 
*/ -static void -nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) +static __be32 +nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) { struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(conf->len)); - WRITE64(ld->ld_start); - WRITE64(ld->ld_length); - WRITE32(ld->ld_type); +again: + p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); + if (!p) { + /* + * Don't fail to return the result just because we can't + * return the conflicting open: + */ + if (conf->len) { + conf->len = 0; + conf->data = NULL; + goto again; + } + return nfserr_resource; + } + p = xdr_encode_hyper(p, ld->ld_start); + p = xdr_encode_hyper(p, ld->ld_length); + *p++ = cpu_to_be32(ld->ld_type); if (conf->len) { - WRITEMEM(&ld->ld_clientid, 8); - WRITE32(conf->len); - WRITEMEM(conf->data, conf->len); - kfree(conf->data); + p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); + p = xdr_encode_opaque(p, conf->data, conf->len); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - WRITE64((u64)0); /* clientid */ - WRITE32(0); /* length of owner name */ + p = xdr_encode_hyper(p, (u64)0); /* clientid */ + *p++ = cpu_to_be32(0); /* length of owner name */ } - ADJUST_ARGS(); + return nfserr_denied; } static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lock->lk_denied); - + nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); + kfree(lock->lk_denied.ld_owner.data); return nfserr; } static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { + struct xdr_stream *xdr = &resp->xdr; + if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); return nfserr; } static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &locku->lu_stateid); + nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid); return nfserr; } @@ -2833,12 +2924,14 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &link->li_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); } return nfserr; } @@ -2847,72 +2940,86 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) goto out; - nfsd4_encode_stateid(resp, &open->op_stateid); - RESERVE_SPACE(40); - write_cinfo(&p, &open->op_cinfo); - WRITE32(open->op_rflags); - WRITE32(2); - WRITE32(open->op_bmval[0]); - WRITE32(open->op_bmval[1]); - WRITE32(open->op_delegate_type); - ADJUST_ARGS(); + nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); + if (nfserr) + goto out; + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = 
encode_cinfo(p, &open->op_cinfo); + *p++ = cpu_to_be32(open->op_rflags); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(open->op_bmval[0]); + *p++ = cpu_to_be32(open->op_bmval[1]); + *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(20); - WRITE32(open->op_recall); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ break; case NFS4_OPEN_DELEGATE_WRITE: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(32); - WRITE32(0); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* * TODO: space_limit's in delegations */ - WRITE32(NFS4_LIMIT_SIZE); - WRITE32(~(u32)0); - WRITE32(~(u32)0); + *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); + *p++ = cpu_to_be32(~(u32)0); + *p++ = cpu_to_be32(~(u32)0); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? 
*/ break; case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ switch (open->op_why_no_deleg) { case WND4_CONTENTION: case WND4_RESOURCE: - RESERVE_SPACE(8); - WRITE32(open->op_why_no_deleg); - WRITE32(0); /* deleg signaling not supported yet */ + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + /* deleg signaling not supported yet: */ + *p++ = cpu_to_be32(0); break; default: - RESERVE_SPACE(4); - WRITE32(open->op_why_no_deleg); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); } - ADJUST_ARGS(); break; default: BUG(); @@ -2925,8 +3032,10 @@ out: static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); return nfserr; } @@ -2934,127 +3043,233 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &od->od_stateid); + nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid); return nfserr; } -static __be32 -nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) +static __be32 nfsd4_encode_splice_read( + struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) { + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = xdr->buf; u32 eof; - int v; - struct page *page; - unsigned long maxcount; - long len; - __be32 *p; + int space_left; + __be32 nfserr; + __be32 *p = xdr->p - 2; - if (nfserr) - return nfserr; - if (resp->xbuf->page_len) + /* + * Don't inline pages unless we know there's room for eof, + * count, and possible padding: + */ + if (xdr->end - xdr->p < 3) return nfserr_resource; - RESERVE_SPACE(8); /* eof flag and byte count */ + nfserr = nfsd_splice_read(read->rd_rqstp, file, + read->rd_offset, &maxcount); + if (nfserr) { + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode: + */ + buf->page_len = 0; + return nfserr; + } - maxcount = svc_max_payload(resp->rqstp); - if (maxcount > read->rd_length) - maxcount = read->rd_length; + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); + + *(p++) = htonl(eof); + *(p++) = htonl(maxcount); + + buf->page_len = maxcount; + buf->len += maxcount; + xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Use rest of head for padding and remaining ops: */ + buf->tail[0].iov_base = xdr->p; + buf->tail[0].iov_len = 0; + xdr->iov = buf->tail; + if (maxcount&3) { + int pad = 4 - (maxcount&3); + + *(xdr->p++) = 0; + + buf->tail[0].iov_base += maxcount&3; + buf->tail[0].iov_len = pad; + buf->len += pad; + } + + space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, + buf->buflen - buf->len); + buf->buflen = buf->len + space_left; + xdr->end = (__be32 *)((void *)xdr->end + space_left); + + return 0; +} + +static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 eof; + int v; + int starting_len = xdr->buf->len - 8; + long len; + int thislen; + __be32 nfserr; 
+ __be32 tmp; + __be32 *p; + u32 zzz = 0; + int pad; len = maxcount; v = 0; - while (len > 0) { - page = *(resp->rqstp->rq_next_page); - if (!page) { /* ran out of pages */ - maxcount -= len; - break; - } - resp->rqstp->rq_vec[v].iov_base = page_address(page); - resp->rqstp->rq_vec[v].iov_len = - len < PAGE_SIZE ? len : PAGE_SIZE; - resp->rqstp->rq_next_page++; + + thislen = (void *)xdr->end - (void *)xdr->p; + if (len < thislen) + thislen = len; + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; + v++; + len -= thislen; + + while (len) { + thislen = min_t(long, len, PAGE_SIZE); + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; v++; - len -= PAGE_SIZE; + len -= thislen; } read->rd_vlen = v; - nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, - read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, - &maxcount); - + nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, + read->rd_vlen, &maxcount); if (nfserr) return nfserr; + xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); - WRITE32(eof); - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); + tmp = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); - } + pad = (maxcount&3) ? 
4 - (maxcount&3) : 0; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, + &zzz, pad); return 0; + } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) { - int maxcount; - char *page; + unsigned long maxcount; + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_filp; + int starting_len = xdr->buf->len; + struct raparms *ra; __be32 *p; + __be32 err; if (nfserr) return nfserr; - if (resp->xbuf->page_len) + + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ + if (!p) { + WARN_ON_ONCE(resp->rqstp->rq_splice_ok); return nfserr_resource; - if (!*resp->rqstp->rq_next_page) + } + if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + WARN_ON_ONCE(1); return nfserr_resource; + } + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > xdr->buf->buflen - xdr->buf->len) + maxcount = xdr->buf->buflen - xdr->buf->len; + if (maxcount > read->rd_length) + maxcount = read->rd_length; + + if (!read->rd_filp) { + err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, + &file, &ra); + if (err) + goto err_truncate; + } + + if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + err = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + err = nfsd4_encode_readv(resp, read, file, maxcount); + + if (!read->rd_filp) + nfsd_put_tmp_read_open(file, ra); + +err_truncate: + if (err) + xdr_truncate_encode(xdr, starting_len); + return err; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + __be32 wire_count; + int zero = 0; + struct xdr_stream *xdr = &resp->xdr; + int length_offset = xdr->buf->len; + __be32 *p; - page = page_address(*(resp->rqstp->rq_next_page++)); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; maxcount = PAGE_SIZE; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, maxcount); + if (!p) + return nfserr_resource; /* * XXX: By default, the ->readlink() VFS op will truncate symlinks * if they would overflow the buffer. Is this kosher in NFSv4? If * not, one easy fix is: if ->readlink() precisely fills the buffer, * assume that truncation occurred, and return NFS4ERR_RESOURCE. 
*/ - nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount); + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, + (char *)p, &maxcount); if (nfserr == nfserr_isdir) - return nfserr_inval; - if (nfserr) + nfserr = nfserr_inval; + if (nfserr) { + xdr_truncate_encode(xdr, length_offset); return nfserr; - - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); } + + wire_count = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); + xdr_truncate_encode(xdr, length_offset + 4 + maxcount); + if (maxcount & 3) + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, + &zero, 4 - (maxcount&3)); return 0; } @@ -3062,47 +3277,52 @@ static __be32 nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) { int maxcount; + int bytes_left; loff_t offset; - __be32 *page, *savep, *tailbase; + __be64 wire_offset; + struct xdr_stream *xdr = &resp->xdr; + int starting_len = xdr->buf->len; __be32 *p; if (nfserr) return nfserr; - if (resp->xbuf->page_len) - return nfserr_resource; - if (!*resp->rqstp->rq_next_page) - return nfserr_resource; - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - savep = p; + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - tailbase = p; - - maxcount = PAGE_SIZE; - if (maxcount > readdir->rd_maxcount) - maxcount = readdir->rd_maxcount; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) + - (char *)resp->xdr.buf->head[0].iov_base; /* - * Convert from bytes to words, account for the two words already - * written, make sure to leave two words at the end for the next - * pointer and eof field. 
+ * Number of bytes left for directory entries allowing for the + * final 8 bytes of the readdir and a following failed op: + */ + bytes_left = xdr->buf->buflen - xdr->buf->len + - COMPOUND_ERR_SLACK_SPACE - 8; + if (bytes_left < 0) { + nfserr = nfserr_resource; + goto err_no_verf; + } + maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX); + /* + * Note the rfc defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier above + * and the 8 bytes encoded at the end of this function: */ - maxcount = (maxcount >> 2) - 4; - if (maxcount < 0) { - nfserr = nfserr_toosmall; + if (maxcount < 16) { + nfserr = nfserr_toosmall; goto err_no_verf; } + maxcount = min_t(int, maxcount-16, bytes_left); - page = page_address(*(resp->rqstp->rq_next_page++)); + readdir->xdr = xdr; + readdir->rd_maxcount = maxcount; readdir->common.err = 0; - readdir->buflen = maxcount; - readdir->buffer = page; - readdir->offset = NULL; + readdir->cookie_offset = 0; offset = readdir->rd_cookie; nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, @@ -3110,42 +3330,49 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 &readdir->common, nfsd4_encode_dirent); if (nfserr == nfs_ok && readdir->common.err == nfserr_toosmall && - readdir->buffer == page) - nfserr = nfserr_toosmall; + xdr->buf->len == starting_len + 8) { + /* nothing encoded; which limit did we hit?: */ + if (maxcount - 16 < bytes_left) + /* It was the fault of rd_maxcount: */ + nfserr = nfserr_toosmall; + else + /* We ran out of buffer space: */ + nfserr = nfserr_resource; + } if (nfserr) goto err_no_verf; - if (readdir->offset) - xdr_encode_hyper(readdir->offset, offset); + if (readdir->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, + &wire_offset, 8); + } - p = readdir->buffer; + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + goto err_no_verf; + } *p++ = 0; /* no more entries */ *p++ = htonl(readdir->common.err == nfserr_eof); - resp->xbuf->page_len = ((char*)p) - - (char*)page_address(*(resp->rqstp->rq_next_page-1)); - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = tailbase; - resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: - p = savep; - ADJUST_ARGS(); + xdr_truncate_encode(xdr, starting_len); return nfserr; } static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &remove->rm_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); } return nfserr; } @@ -3153,19 +3380,21 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(40); - write_cinfo(&p, &rename->rn_sinfo); - write_cinfo(&p, &rename->rn_tinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); } return nfserr; } static __be32 -nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, +nfsd4_do_encode_secinfo(struct 
xdr_stream *xdr, __be32 nfserr, struct svc_export *exp) { u32 i, nflavs, supported; @@ -3176,6 +3405,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nfserr) goto out; + nfserr = nfserr_resource; if (exp->ex_nflavors) { flavs = exp->ex_flavors; nflavs = exp->ex_nflavors; @@ -3197,9 +3427,10 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, } supported = 0; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; flavorsp = p++; /* to be backfilled later */ - ADJUST_ARGS(); for (i = 0; i < nflavs; i++) { rpc_authflavor_t pf = flavs[i].pseudoflavor; @@ -3207,18 +3438,20 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); - WRITE32(RPC_AUTH_GSS); - WRITE32(info.oid.len); - WRITEMEM(info.oid.data, info.oid.len); - WRITE32(info.qop); - WRITE32(info.service); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4 + 4 + + XDR_LEN(info.oid.len) + 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(RPC_AUTH_GSS); + p = xdr_encode_opaque(p, info.oid.data, info.oid.len); + *p++ = cpu_to_be32(info.qop); + *p++ = cpu_to_be32(info.service); } else if (pf < RPC_AUTH_MAXFLAVOR) { supported++; - RESERVE_SPACE(4); - WRITE32(pf); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = cpu_to_be32(pf); } else { if (report) pr_warn("NFS: SECINFO: security flavor %u " @@ -3229,7 +3462,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nflavs != supported) report = false; *flavorsp = htonl(supported); - + nfserr = 0; out: if (exp) exp_put(exp); @@ -3240,14 +3473,18 @@ static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp); } static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp); } /* @@ -3257,41 +3494,47 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; - RESERVE_SPACE(16); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; if (nfserr) { - WRITE32(3); - WRITE32(0); - WRITE32(0); - WRITE32(0); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } else { - WRITE32(3); - WRITE32(setattr->sa_bmval[0]); - WRITE32(setattr->sa_bmval[1]); - WRITE32(setattr->sa_bmval[2]); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(setattr->sa_bmval[0]); + *p++ = cpu_to_be32(setattr->sa_bmval[1]); + *p++ = cpu_to_be32(setattr->sa_bmval[2]); } - ADJUST_ARGS(); return nfserr; } static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE); - WRITEMEM(&scd->se_clientid, 8); - WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, 
&scd->se_clientid, 8); + p = xdr_encode_opaque_fixed(p, &scd->se_confirm, + NFS4_VERIFIER_SIZE); } else if (nfserr == nfserr_clid_inuse) { - RESERVE_SPACE(8); - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -3299,14 +3542,17 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(16); - WRITE32(write->wr_bytes_written); - WRITE32(write->wr_how_written); - WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -3323,6 +3569,7 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; char *major_id; char *server_scope; @@ -3338,60 +3585,61 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = utsname()->nodename; server_scope_sz = strlen(server_scope); - RESERVE_SPACE( + p = xdr_reserve_space(xdr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + 4 /* spr_how */); + if (!p) + return nfserr_resource; - WRITEMEM(&exid->clientid, 8); - WRITE32(exid->seqid); - WRITE32(exid->flags); + p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); + *p++ = cpu_to_be32(exid->seqid); + *p++ = cpu_to_be32(exid->flags); - WRITE32(exid->spa_how); - ADJUST_ARGS(); + *p++ = cpu_to_be32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: /* spo_must_enforce, spo_must_allow */ - RESERVE_SPACE(16); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; /* spo_must_enforce bitmap: */ - WRITE32(2); - WRITE32(nfs4_minimal_spo_must_enforce[0]); - WRITE32(nfs4_minimal_spo_must_enforce[1]); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ - WRITE32(0); + *p++ = cpu_to_be32(0); - ADJUST_ARGS(); break; default: WARN_ON_ONCE(1); } - RESERVE_SPACE( + p = xdr_reserve_space(xdr, 8 /* so_minor_id */ + 4 /* so_major_id.len */ + (XDR_QUADLEN(major_id_sz) * 4) + 4 /* eir_server_scope.len */ + (XDR_QUADLEN(server_scope_sz) * 4) + 4 /* eir_server_impl_id.count (0) */); + if (!p) + return nfserr_resource; /* The server_owner struct */ - WRITE64(minor_id); /* Minor id */ + p = xdr_encode_hyper(p, minor_id); /* Minor id */ /* major id */ - WRITE32(major_id_sz); - WRITEMEM(major_id, major_id_sz); + p = xdr_encode_opaque(p, major_id, major_id_sz); /* Server scope */ - WRITE32(server_scope_sz); - WRITEMEM(server_scope, server_scope_sz); + p = xdr_encode_opaque(p, server_scope, server_scope_sz); /* Implementation id */ - WRITE32(0); /* zero length nfs_impl_id4 array */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; } @@ -3399,47 +3647,54 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - 
RESERVE_SPACE(24); - WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(sess->seqid); - WRITE32(sess->flags); - ADJUST_ARGS(); - - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->fore_channel.maxreq_sz); - WRITE32(sess->fore_channel.maxresp_sz); - WRITE32(sess->fore_channel.maxresp_cached); - WRITE32(sess->fore_channel.maxops); - WRITE32(sess->fore_channel.maxreqs); - WRITE32(sess->fore_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, sess->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(sess->seqid); + *p++ = cpu_to_be32(sess->flags); + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->fore_channel.maxops); + *p++ = cpu_to_be32(sess->fore_channel.maxreqs); + *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); if (sess->fore_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->fore_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); } - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->back_channel.maxreq_sz); - WRITE32(sess->back_channel.maxresp_sz); - WRITE32(sess->back_channel.maxresp_cached); - WRITE32(sess->back_channel.maxops); - WRITE32(sess->back_channel.maxreqs); - WRITE32(sess->back_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->back_channel.maxops); + *p++ = cpu_to_be32(sess->back_channel.maxreqs); + *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); if (sess->back_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->back_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); } return 0; } @@ -3448,22 +3703,25 @@ static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); - WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(seq->seqid); - WRITE32(seq->slotid); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, seq->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(seq->seqid); + *p++ = cpu_to_be32(seq->slotid); /* Note slotid's are numbered from zero: */ - WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ - WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ - WRITE32(seq->status_flags); + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ + *p++ = cpu_to_be32(seq->status_flags); - ADJUST_ARGS(); - resp->cstate.datap = p; /* DRC cache data pointer */ + resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ return 0; } @@ -3471,20 +3729,22 @@ static __be32 
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); + p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); + if (!p) + return nfserr_resource; *p++ = htonl(test_stateid->ts_num_ids); list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { *p++ = stateid->ts_id_status; } - ADJUST_ARGS(); return nfserr; } @@ -3563,81 +3823,99 @@ static nfsd4_enc nfsd4_enc_ops[] = { }; /* - * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation with pad. - * - * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() - * which was specified at nfsd4_operation, else pad is zero. - * - * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. + * Calculate whether we still have space to encode repsize bytes. + * There are two considerations: + * - For NFS versions >=4.1, the size of the reply must stay within + * session limits + * - For all NFS versions, we must stay within limited preallocated + * buffer space. * - * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so - * will be at least a page and will therefore hold the xdr_buf head. + * This is called before the operation is processed, so can only provide + * an upper estimate. For some nonidempotent operations (such as + * getattr), it's not necessarily a problem if that estimate is wrong, + * as we can fail it after processing without significant side effects. */ -__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) { - struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_session *session = NULL; + struct xdr_buf *buf = &resp->rqstp->rq_res; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0; + if (buf->len + respsize <= buf->buflen) + return nfs_ok; if (!nfsd4_has_session(&resp->cstate)) - return 0; - - session = resp->cstate.session; - - if (xb->page_len == 0) { - length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; - } else { - if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) - tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; - - length = xb->head[0].iov_len + xb->page_len + tlen + pad; - } - dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, - length, xb->page_len, tlen, pad); - - if (length > session->se_fchannel.maxresp_sz) - return nfserr_rep_too_big; - - if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) && - length > session->se_fchannel.maxresp_cached) + return nfserr_resource; + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) { + WARN_ON_ONCE(1); return nfserr_rep_too_big_to_cache; - - return 0; + } + return nfserr_rep_too_big; } void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { + struct xdr_stream *xdr = &resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; - __be32 *statp; + struct svc_rqst *rqstp = resp->rqstp; + int post_err_offset; + nfsd4_enc encoder; __be32 *p; - RESERVE_SPACE(8); - WRITE32(op->opnum); - statp = p++; /* to be backfilled at the end */ - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + post_err_offset = xdr->buf->len; if (op->opnum == OP_ILLEGAL) goto status; 
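[Two related patterns converge here. nfsd4_check_resp_size() collapses the old head/page/tail arithmetic into a single buf->len + respsize <= buf->buflen comparison, and nfsd4_encode_operation() (continued below) remembers post_err_offset so the op status can be backfilled with write_bytes_to_xdr_buf() after the body is encoded; the same remembered-offset trick already appears above for the fattr attrlen word and the readdir cookie. A simplified model of that backfill, assuming a single flat byte array in place of the kernel's multi-segment xdr_buf:

    #include <arpa/inet.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Flat-buffer stand-in for write_bytes_to_xdr_buf(): the kernel
     * helper additionally walks the head/page/tail segments.
     */
    static void backfill(uint8_t *buf, size_t offset, const void *obj,
                         size_t len)
    {
            memcpy(buf + offset, obj, len);
    }

    int main(void)
    {
            uint8_t buf[64];
            size_t len = 0;
            uint32_t word;

            word = htonl(22);                   /* an opnum, e.g. OP_PUTFH */
            memcpy(buf + len, &word, 4); len += 4;

            size_t status_offset = len;         /* placeholder word */
            len += 4;

            word = htonl(0x1234);               /* pretend op body */
            memcpy(buf + len, &word, 4); len += 4;

            word = htonl(0);                    /* nfs_ok, network order */
            backfill(buf, status_offset, &word, 4);

            printf("%zu bytes encoded, status backfilled at %zu\n",
                   len, status_offset);
            return 0;
    }

]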
BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); - op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + encoder = nfsd4_enc_ops[op->opnum]; + op->status = encoder(resp, op->status, &op->u); + xdr_commit_encode(xdr); + /* nfsd4_check_resp_size guarantees enough room for error status */ - if (!op->status) - op->status = nfsd4_check_resp_size(resp, 0); + if (!op->status) { + int space_needed = 0; + if (!nfsd4_last_compound_op(rqstp)) + space_needed = COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, space_needed); + } + if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { + struct nfsd4_slot *slot = resp->cstate.slot; + + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) + op->status = nfserr_rep_too_big_to_cache; + else + op->status = nfserr_rep_too_big; + } + if (op->status == nfserr_resource || + op->status == nfserr_rep_too_big || + op->status == nfserr_rep_too_big_to_cache) { + /* + * The operation may have already been encoded or + * partially encoded. No op returns anything additional + * in the case of one of these three errors, so we can + * just truncate back to after the status. But it's a + * bug if we had to do this on a non-idempotent op: + */ + warn_on_nonidempotent_op(op); + xdr_truncate_encode(xdr, post_err_offset); + } if (so) { + int len = xdr->buf->len - post_err_offset; + so->so_replay.rp_status = op->status; - so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); - memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen); + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + so->so_replay.rp_buf, len); } status: - /* - * Note: We write the status directly, instead of using WRITE32(), - * since it is already in network byte order. - */ - *statp = op->status; + /* Note that op->status is already in network byte order: */ + write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); } /* @@ -3649,21 +3927,22 @@ status: * called with nfs4_lock_state() held */ void -nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { __be32 *p; struct nfs4_replay *rp = op->replay; BUG_ON(!rp); - RESERVE_SPACE(8); - WRITE32(op->opnum); + p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); *p++ = rp->rp_status; /* already xdr'ed */ - ADJUST_ARGS(); - RESERVE_SPACE(rp->rp_buflen); - WRITEMEM(rp->rp_buf, rp->rp_buflen); - ADJUST_ARGS(); + p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); } int @@ -3720,19 +3999,19 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo * All that remains is to write the tag and operation count... 
*/ struct nfsd4_compound_state *cs = &resp->cstate; - struct kvec *iov; + struct xdr_buf *buf = resp->xdr.buf; + + WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + + buf->tail[0].iov_len); + + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + p = resp->tagp; *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); - if (rqstp->rq_res.page_len) - iov = &rqstp->rq_res.tail[0]; - else - iov = &rqstp->rq_res.head[0]; - iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; - BUG_ON(iov->iov_len > PAGE_SIZE); if (nfsd4_has_session(cs)) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp = cs->session->se_client; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index f8f060ffbf4f..6040da8830ff 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp) hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); } -static inline bool -nfsd_cache_entry_expired(struct svc_cacherep *rp) -{ - return rp->c_state != RC_INPROG && - time_after(jiffies, rp->c_timestamp + RC_EXPIRE); -} - /* * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. @@ -242,8 +235,14 @@ prune_cache_entries(void) long freed = 0; list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { - if (!nfsd_cache_entry_expired(rp) && - num_drc_entries <= max_drc_entries) + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + if (num_drc_entries <= max_drc_entries && + time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; nfsd_reply_cache_free_locked(rp); freed++; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f34d9de802ab..51844048937f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1179,7 +1179,6 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - nfs4_state_init(); retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ if (retval) goto out_free_slabs; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 479eb681c27c..847daf37e566 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -15,11 +15,20 @@ #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> +#include <linux/sunrpc/svc.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/nfsd/debug.h> -#include <linux/nfsd/export.h> -#include <linux/nfsd/stats.h> +#include <uapi/linux/nfsd/debug.h> + +#include "stats.h" +#include "export.h" + +#undef ifdebug +#ifdef NFSD_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif /* * nfsd version @@ -106,7 +115,6 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned long max_delegations; -void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -117,7 +125,6 @@ void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); #else -static inline void nfs4_state_init(void) { } static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3c37b160dcad..ec8393418154 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c 
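The prune_cache_entries() hunk above deserves a second look, because the old loop folded two decisions into one predicate. Here is the deleted version again, annotated with the two failure modes the rewrite removes (the code is from the hunk; only the comments are added):

	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
		/* An RC_INPROG entry was never "expired", so while under
		 * the entry limit this test halted the whole scan at the
		 * first in-progress call... */
		if (!nfsd_cache_entry_expired(rp) &&
		    num_drc_entries <= max_drc_entries)
			break;
		/* ...and while over the limit, it fell through and freed
		 * an entry whose call might still be in progress. */
		nfsd_reply_cache_free_locked(rp);
		freed++;
	}

The new loop separates the two concerns: RC_INPROG entries are never freed but the scan continues past them, and only a young entry under the quota terminates the walk.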
@@ -88,9 +88,8 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, /* Check if the request originated from a secure port. */ if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk(KERN_WARNING - "nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return nfserr_perm; } @@ -169,8 +168,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) data_left -= len; if (data_left < 0) return error; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); - fid = (struct fid *)(fh->fh_auth + len); + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); + fid = (struct fid *)(fh->fh_fsid + len); } else { __u32 tfh[2]; dev_t xdev; @@ -385,7 +384,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, { if (dentry != exp->ex_path.dentry) { struct fid *fid = (struct fid *) - (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1); + (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); @@ -513,7 +512,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ struct inode * inode = dentry->d_inode; - __u32 *datap; dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", @@ -557,17 +555,16 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); } else { - int len; + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; fhp->fh_handle.fh_auth_type = 0; - datap = fhp->fh_handle.fh_auth+0; - mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fhp->fh_handle.fh_fsid, + ex_dev, exp->ex_path.dentry->d_inode->i_ino, exp->ex_fsid, exp->ex_uuid); - len = key_len(fhp->fh_handle.fh_fsid_type); - datap += len/4; - fhp->fh_handle.fh_size = 4 + len; - if (inode) _fh_update(fhp, exp, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index ad67964d0bb1..2e89e70ac15c 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -1,9 +1,58 @@ -/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include <linux/sunrpc/svc.h> +#include <uapi/linux/nfsd/nfsfh.h> + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} + +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} -#ifndef _LINUX_NFSD_FH_INT_H -#define _LINUX_NFSD_FH_INT_H +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. 
+ */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + int fh_maxsize; /* max size for fh_handle */ + + unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ + +#ifdef CONFIG_NFSD_V3 + unsigned char fh_post_saved; /* post-op attrs saved */ + unsigned char fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec fh_pre_mtime; /* mtime before oper */ + struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. + */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ -#include <linux/nfsd/nfsfh.h> +} svc_fh; enum nfsd_fsid { FSID_DEV = 0, @@ -215,4 +264,4 @@ fh_unlock(struct svc_fh *fhp) } } -#endif /* _LINUX_NFSD_FH_INT_H */ +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9a4a5f9e7468..1879e43f2868 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -591,12 +591,6 @@ nfsd(void *vrqstp) nfsdstats.th_cnt++; mutex_unlock(&nfsd_mutex); - /* - * We want less throttling in balance_dirty_pages() so that nfs to - * localhost doesn't cause nfsd to lock up due to all the client's - * dirty pages. - */ - current->flags |= PF_LESS_THROTTLE; set_freezable(); /* diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a47ac5a..1ac306b769df 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -214,7 +214,8 @@ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) int nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -248,7 +249,8 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->offset = ntohl(*p++); @@ -281,7 +283,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, hdr, dlen; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p++; /* beginoffset */ @@ -355,7 +358,8 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, int nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -391,7 +395,8 @@ int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 424d8f5f2317..374c66283ac5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -37,7 +37,6 @@ #include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> -#include <linux/nfsd/nfsfh.h> #include "nfsfh.h" typedef struct { @@ -123,7 +122,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of operations per session compound */ #define 
NFSD_MAX_OPS_PER_COMPOUND 16 /* Maximum session per slot cache size */ -#define NFSD_SLOT_CACHE_SIZE 1024 +#define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ #define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 #define NFSD_MAX_MEM_PER_SESSION \ @@ -464,8 +463,6 @@ extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); -extern void nfs4_free_openowner(struct nfs4_openowner *); -extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_init_callback(struct nfsd4_callback *); extern void nfsd4_probe_callback(struct nfs4_client *clp); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 6d4521feb6e3..cd90878a76aa 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/sunrpc/stats.h> -#include <linux/nfsd/stats.h> #include <net/net_namespace.h> #include "nfsd.h" diff --git a/include/linux/nfsd/stats.h b/fs/nfsd/stats.h index e75b2544ff12..a5c944b771c6 100644 --- a/include/linux/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -1,12 +1,10 @@ /* - * linux/include/linux/nfsd/stats.h - * * Statistics for NFS server. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#ifndef LINUX_NFSD_STATS_H -#define LINUX_NFSD_STATS_H +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H #include <uapi/linux/nfsd/stats.h> @@ -42,4 +40,4 @@ extern struct svc_stat nfsd_svcstats; void nfsd_stat_init(void); void nfsd_stat_shutdown(void); -#endif /* LINUX_NFSD_STATS_H */ +#endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 16f0673a423c..140c496f612c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -820,55 +820,54 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) { - mm_segment_t oldfs; - __be32 err; - int host_err; - - err = nfserr_perm; - - if (file->f_op->splice_read && rqstp->rq_splice_ok) { - struct splice_desc sd = { - .len = 0, - .total_len = *count, - .pos = offset, - .u.data = rqstp, - }; - - rqstp->rq_next_page = rqstp->rq_respages + 1; - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - } else { - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); - set_fs(oldfs); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; - err = 0; fsnotify_access(file); + return 0; } else - err = nfserrno(host_err); - return err; + return nfserrno(host_err); +} + +int nfsd_splice_read(struct svc_rqst *rqstp, + struct file *file, loff_t offset, unsigned long *count) +{ + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + int host_err; + + rqstp->rq_next_page = rqstp->rq_respages + 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + return nfsd_finish_read(file, count, host_err); } -static void kill_suid(struct dentry *dentry) +int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) { - struct iattr ia; - 
ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mm_segment_t oldfs; + int host_err; - mutex_lock(&dentry->d_inode->i_mutex); - /* - * Note we call this on write, so notify_change will not - * encounter any conflicting delegations: - */ - notify_change(dentry, &ia, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + return nfsd_finish_read(file, count, host_err); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + if (file->f_op->splice_read && rqstp->rq_splice_ok) + return nfsd_splice_read(rqstp, file, offset, count); + else + return nfsd_readv(file, offset, vec, vlen, count); } /* @@ -922,6 +921,16 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + unsigned int pflags = current->flags; + + if (rqstp->rq_local) + /* + * We want less throttling in balance_dirty_pages() + * and shrink_inactive_list() so that nfs to + * localhost doesn't cause nfsd to lock up due to all + * the client's dirty pages or its congested queue. + */ + current->flags |= PF_LESS_THROTTLE; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -942,10 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += host_err; fsnotify_modify(file); - /* clear setuid/setgid flag after write */ - if (inode->i_mode & (S_ISUID | S_ISGID)) - kill_suid(dentry); - if (stable) { if (use_wgather) host_err = wait_for_concurrent_writes(file); @@ -959,36 +964,33 @@ out_nfserr: err = 0; else err = nfserrno(host_err); + if (rqstp->rq_local) + tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. - * N.B. After this call fhp needs an fh_put - */ -__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file **file, struct raparms **ra) { - struct file *file; struct inode *inode; - struct raparms *ra; __be32 err; - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); if (err) return err; - inode = file_inode(file); + inode = file_inode(*file); /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - if (ra && ra->p_set) - file->f_ra = ra->p_ra; - - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + if (*ra && (*ra)->p_set) + (*file)->f_ra = (*ra)->p_ra; + return nfs_ok; +} +void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) +{ /* Write back readahead params */ if (ra) { struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; @@ -998,28 +1000,29 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra->p_count--; spin_unlock(&rab->pb_lock); } - nfsd_close(file); - return err; } -/* As above, but use the provided file descriptor. */ -__be32 -nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *count) +/* + * Read data from a file. 
count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - __be32 err; + struct file *file; + struct raparms *ra; + __be32 err; + + err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + if (err) + return err; + + err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + nfsd_put_tmp_read_open(file, ra); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - } else /* Note file may still be NULL in NFSv4 special stateid case: */ - err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); -out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fbe90bdb2214..91b6ae3f658b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -70,10 +70,16 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); void nfsd_close(struct file *); +struct raparms; +__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, + struct file **, struct raparms **); +void nfsd_put_tmp_read_open(struct file *, struct raparms *); +int nfsd_splice_read(struct svc_rqst *, + struct file *, loff_t, unsigned long *); +int nfsd_readv(struct file *, loff_t, struct kvec *, int, + unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, - loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 5ea7df305083..18cbb6d9c8a9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -58,7 +58,7 @@ struct nfsd4_compound_state { /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; - __be32 *datap; + int data_offset; size_t iovlen; u32 minorversion; __be32 status; @@ -287,9 +287,8 @@ struct nfsd4_readdir { struct svc_fh * rd_fhp; /* response */ struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + struct xdr_stream *xdr; + int cookie_offset; }; struct nfsd4_release_lockowner { @@ -506,9 +505,7 @@ struct nfsd4_compoundargs { struct nfsd4_compoundres { /* scratch variables for XDR encode */ - __be32 * p; - __be32 * end; - struct xdr_buf * xbuf; + struct xdr_stream xdr; struct svc_rqst * rqstp; u32 taglen; @@ -538,6 +535,9 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op); +void warn_on_nonidempotent_op(struct nfsd4_op *op); + #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) static inline void @@ -563,10 +563,11 @@ int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); -void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); -__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, 
int countp, - u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid *setclid); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0b18776b075e..6152cbe353e8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1215,7 +1215,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !inode_capable(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* diff --git a/include/linux/capability.h b/include/linux/capability.h index a6ee1f9a5018..84b13ad67c1c 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -210,7 +210,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool inode_capable(const struct inode *inode, int cap); +extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index dcaad79f54ed..219d79627c05 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -17,13 +17,13 @@ #include <linux/fs.h> #include <linux/kref.h> #include <linux/utsname.h> -#include <linux/nfsd/nfsfh.h> #include <linux/lockd/bind.h> #include <linux/lockd/xdr.h> #ifdef CONFIG_LOCKD_V4 #include <linux/lockd/xdr4.h> #endif #include <linux/lockd/debug.h> +#include <linux/sunrpc/svc.h> /* * Version string diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index b73027298b3a..d424b9de3aff 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -63,12 +63,12 @@ struct mmc_ext_csd { unsigned int power_off_longtime; /* Units: ms */ u8 power_off_notification; /* state */ unsigned int hs_max_dtr; + unsigned int hs200_max_dtr; #define MMC_HIGH_26_MAX_DTR 26000000 #define MMC_HIGH_52_MAX_DTR 52000000 #define MMC_HIGH_DDR_MAX_DTR 52000000 #define MMC_HS200_MAX_DTR 200000000 unsigned int sectors; - unsigned int card_type; unsigned int hc_erase_size; /* In sectors */ unsigned int hc_erase_timeout; /* In milliseconds */ unsigned int sec_trim_mult; /* Secure trim multiplier */ @@ -110,6 +110,7 @@ struct mmc_ext_csd { u8 raw_pwr_cl_200_360; /* 237 */ u8 raw_pwr_cl_ddr_52_195; /* 238 */ u8 raw_pwr_cl_ddr_52_360; /* 239 */ + u8 raw_pwr_cl_ddr_200_360; /* 253 */ u8 raw_bkops_status; /* 246 */ u8 raw_sectors[4]; /* 212 - 4 bytes */ @@ -194,6 +195,7 @@ struct sdio_cis { }; struct mmc_host; +struct mmc_ios; struct sdio_func; struct sdio_func_tuple; @@ -250,15 +252,11 @@ struct mmc_card { unsigned int state; /* (our) card state */ #define MMC_STATE_PRESENT (1<<0) /* present in sysfs */ #define MMC_STATE_READONLY (1<<1) /* card is read-only */ -#define MMC_STATE_HIGHSPEED (1<<2) /* card is in high speed mode */ -#define MMC_STATE_BLOCKADDR (1<<3) /* card uses block-addressing */ -#define MMC_STATE_HIGHSPEED_DDR (1<<4) /* card is in high speed mode */ -#define MMC_STATE_ULTRAHIGHSPEED 
(1<<5) /* card is in ultra high speed mode */ -#define MMC_CARD_SDXC (1<<6) /* card is SDXC */ -#define MMC_CARD_REMOVED (1<<7) /* card has been removed */ -#define MMC_STATE_HIGHSPEED_200 (1<<8) /* card is in HS200 mode */ -#define MMC_STATE_DOING_BKOPS (1<<10) /* card is doing BKOPS */ -#define MMC_STATE_SUSPENDED (1<<11) /* card is suspended */ +#define MMC_STATE_BLOCKADDR (1<<2) /* card uses block-addressing */ +#define MMC_CARD_SDXC (1<<3) /* card is SDXC */ +#define MMC_CARD_REMOVED (1<<4) /* card has been removed */ +#define MMC_STATE_DOING_BKOPS (1<<5) /* card is doing BKOPS */ +#define MMC_STATE_SUSPENDED (1<<6) /* card is suspended */ unsigned int quirks; /* card quirks */ #define MMC_QUIRK_LENIENT_FN0 (1<<0) /* allow SDIO FN0 writes outside of the VS CCCR range */ #define MMC_QUIRK_BLKSZ_FOR_BYTE_MODE (1<<1) /* use func->cur_blksize */ @@ -301,6 +299,7 @@ struct mmc_card { struct sdio_func_tuple *tuples; /* unknown common tuples */ unsigned int sd_bus_speed; /* Bus Speed Mode set for the card */ + unsigned int mmc_avail_type; /* device type supported by both host and card */ struct dentry *debugfs_root; struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */ @@ -353,7 +352,7 @@ struct mmc_fixup { #define CID_OEMID_ANY ((unsigned short) -1) #define CID_NAME_ANY (NULL) -#define END_FIXUP { 0 } +#define END_FIXUP { NULL } #define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end, \ _cis_vendor, _cis_device, \ @@ -418,11 +417,7 @@ static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data) #define mmc_card_present(c) ((c)->state & MMC_STATE_PRESENT) #define mmc_card_readonly(c) ((c)->state & MMC_STATE_READONLY) -#define mmc_card_highspeed(c) ((c)->state & MMC_STATE_HIGHSPEED) -#define mmc_card_hs200(c) ((c)->state & MMC_STATE_HIGHSPEED_200) #define mmc_card_blockaddr(c) ((c)->state & MMC_STATE_BLOCKADDR) -#define mmc_card_ddr_mode(c) ((c)->state & MMC_STATE_HIGHSPEED_DDR) -#define mmc_card_uhs(c) ((c)->state & MMC_STATE_ULTRAHIGHSPEED) #define mmc_card_ext_capacity(c) ((c)->state & MMC_CARD_SDXC) #define mmc_card_removed(c) ((c) && ((c)->state & MMC_CARD_REMOVED)) #define mmc_card_doing_bkops(c) ((c)->state & MMC_STATE_DOING_BKOPS) @@ -430,11 +425,7 @@ static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data) #define mmc_card_set_present(c) ((c)->state |= MMC_STATE_PRESENT) #define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY) -#define mmc_card_set_highspeed(c) ((c)->state |= MMC_STATE_HIGHSPEED) -#define mmc_card_set_hs200(c) ((c)->state |= MMC_STATE_HIGHSPEED_200) #define mmc_card_set_blockaddr(c) ((c)->state |= MMC_STATE_BLOCKADDR) -#define mmc_card_set_ddr_mode(c) ((c)->state |= MMC_STATE_HIGHSPEED_DDR) -#define mmc_card_set_uhs(c) ((c)->state |= MMC_STATE_ULTRAHIGHSPEED) #define mmc_card_set_ext_capacity(c) ((c)->state |= MMC_CARD_SDXC) #define mmc_card_set_removed(c) ((c)->state |= MMC_CARD_REMOVED) #define mmc_card_set_doing_bkops(c) ((c)->state |= MMC_STATE_DOING_BKOPS) diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 6ce7d2cd3c7a..babaea93bca6 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -248,20 +248,6 @@ struct dw_mci_board { /* delay in mS before detecting cards after interrupt */ u32 detect_delay_ms; - int (*init)(u32 slot_id, irq_handler_t , void *); - int (*get_ro)(u32 slot_id); - int (*get_cd)(u32 slot_id); - int (*get_ocr)(u32 slot_id); - int (*get_bus_wd)(u32 slot_id); - /* - * Enable power to selected slot and set voltage to desired
level. - * Voltage levels are specified using MMC_VDD_xxx defines defined - * in linux/mmc/host.h file. - */ - void (*setpower)(u32 slot_id, u32 volt); - void (*exit)(u32 slot_id); - void (*select_slot)(u32 slot_id); - struct dw_mci_dma_ops *dma_ops; struct dma_pdata *data; struct block_settings *blk_settings; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index cb61ea4d6945..7960424d0bc0 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -17,6 +17,7 @@ #include <linux/fault-inject.h> #include <linux/mmc/core.h> +#include <linux/mmc/card.h> #include <linux/mmc/pm.h> struct mmc_ios { @@ -58,13 +59,9 @@ struct mmc_ios { #define MMC_TIMING_UHS_SDR50 5 #define MMC_TIMING_UHS_SDR104 6 #define MMC_TIMING_UHS_DDR50 7 -#define MMC_TIMING_MMC_HS200 8 - -#define MMC_SDR_MODE 0 -#define MMC_1_2V_DDR_MODE 1 -#define MMC_1_8V_DDR_MODE 2 -#define MMC_1_2V_SDR_MODE 3 -#define MMC_1_8V_SDR_MODE 4 +#define MMC_TIMING_MMC_DDR52 8 +#define MMC_TIMING_MMC_HS200 9 +#define MMC_TIMING_MMC_HS400 10 unsigned char signal_voltage; /* signalling voltage (1.8V or 3.3V) */ @@ -136,6 +133,9 @@ struct mmc_host_ops { /* The tuning command opcode value is different for SD and eMMC cards */ int (*execute_tuning)(struct mmc_host *host, u32 opcode); + + /* Prepare HS400 target operating frequency depending on the host driver */ + int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); int (*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv); void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); @@ -278,6 +278,11 @@ struct mmc_host { #define MMC_CAP2_PACKED_CMD (MMC_CAP2_PACKED_RD | \ MMC_CAP2_PACKED_WR) #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14) /* Don't power up before scan */ +#define MMC_CAP2_HS400_1_8V (1 << 15) /* Can support HS400 1.8V */ +#define MMC_CAP2_HS400_1_2V (1 << 16) /* Can support HS400 1.2V */ +#define MMC_CAP2_HS400 (MMC_CAP2_HS400_1_8V | \ + MMC_CAP2_HS400_1_2V) +#define MMC_CAP2_SDIO_IRQ_NOTHREAD (1 << 17) mmc_pm_flag_t pm_caps; /* supported pm features */ @@ -318,6 +323,8 @@ struct mmc_host { int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ + bool trigger_card_event; /* card_event necessary */ + struct mmc_card *card; /* device attached to this host */ wait_queue_head_t wq; @@ -391,12 +398,13 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host) wake_up_process(host->sdio_irq_thread); } +void sdio_run_irqs(struct mmc_host *host); + #ifdef CONFIG_REGULATOR int mmc_regulator_get_ocrmask(struct regulator *supply); int mmc_regulator_set_ocr(struct mmc_host *mmc, struct regulator *supply, unsigned short vdd_bit); -int mmc_regulator_get_supply(struct mmc_host *mmc); #else static inline int mmc_regulator_get_ocrmask(struct regulator *supply) { @@ -409,13 +417,10 @@ static inline int mmc_regulator_set_ocr(struct mmc_host *mmc, { return 0; } - -static inline int mmc_regulator_get_supply(struct mmc_host *mmc) -{ - return 0; -} #endif +int mmc_regulator_get_supply(struct mmc_host *mmc); + int mmc_pm_notify(struct notifier_block *notify_block, unsigned long, void *); static inline int mmc_card_is_removable(struct mmc_host *host) @@ -475,4 +480,32 @@ static inline unsigned int mmc_host_clk_rate(struct mmc_host *host) return host->ios.clock; } #endif + +static inline int mmc_card_hs(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_SD_HS || + card->host->ios.timing == MMC_TIMING_MMC_HS; } + +static inline int
mmc_card_uhs(struct mmc_card *card) +{ + return card->host->ios.timing >= MMC_TIMING_UHS_SDR12 && + card->host->ios.timing <= MMC_TIMING_UHS_DDR50; +} + +static inline bool mmc_card_hs200(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_HS200; +} + +static inline bool mmc_card_ddr52(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_DDR52; +} + +static inline bool mmc_card_hs400(struct mmc_card *card) +{ + return card->host->ios.timing == MMC_TIMING_MMC_HS400; +} + #endif /* LINUX_MMC_HOST_H */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 50bcde3677ca..64ec963ed347 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -325,6 +325,7 @@ struct _mmc_csd { #define EXT_CSD_POWER_OFF_LONG_TIME 247 /* RO */ #define EXT_CSD_GENERIC_CMD6_TIME 248 /* RO */ #define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */ +#define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */ #define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */ #define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */ #define EXT_CSD_MAX_PACKED_WRITES 500 /* RO */ @@ -354,18 +355,25 @@ struct _mmc_csd { #define EXT_CSD_CMD_SET_SECURE (1<<1) #define EXT_CSD_CMD_SET_CPSECURE (1<<2) -#define EXT_CSD_CARD_TYPE_26 (1<<0) /* Card can run at 26MHz */ -#define EXT_CSD_CARD_TYPE_52 (1<<1) /* Card can run at 52MHz */ -#define EXT_CSD_CARD_TYPE_MASK 0x3F /* Mask out reserved bits */ +#define EXT_CSD_CARD_TYPE_HS_26 (1<<0) /* Card can run at 26MHz */ +#define EXT_CSD_CARD_TYPE_HS_52 (1<<1) /* Card can run at 52MHz */ +#define EXT_CSD_CARD_TYPE_HS (EXT_CSD_CARD_TYPE_HS_26 | \ + EXT_CSD_CARD_TYPE_HS_52) #define EXT_CSD_CARD_TYPE_DDR_1_8V (1<<2) /* Card can run at 52MHz */ /* DDR mode @1.8V or 3V I/O */ #define EXT_CSD_CARD_TYPE_DDR_1_2V (1<<3) /* Card can run at 52MHz */ /* DDR mode @1.2V I/O */ #define EXT_CSD_CARD_TYPE_DDR_52 (EXT_CSD_CARD_TYPE_DDR_1_8V \ | EXT_CSD_CARD_TYPE_DDR_1_2V) -#define EXT_CSD_CARD_TYPE_SDR_1_8V (1<<4) /* Card can run at 200MHz */ -#define EXT_CSD_CARD_TYPE_SDR_1_2V (1<<5) /* Card can run at 200MHz */ /* SDR mode @1.2V I/O */ +#define EXT_CSD_CARD_TYPE_HS200_1_8V (1<<4) /* Card can run at 200MHz */ +#define EXT_CSD_CARD_TYPE_HS200_1_2V (1<<5) /* Card can run at 200MHz */ /* SDR mode @1.2V I/O */ +#define EXT_CSD_CARD_TYPE_HS200 (EXT_CSD_CARD_TYPE_HS200_1_8V | \ + EXT_CSD_CARD_TYPE_HS200_1_2V) +#define EXT_CSD_CARD_TYPE_HS400_1_8V (1<<6) /* Card can run at 200MHz DDR, 1.8V */ +#define EXT_CSD_CARD_TYPE_HS400_1_2V (1<<7) /* Card can run at 200MHz DDR, 1.2V */ +#define EXT_CSD_CARD_TYPE_HS400 (EXT_CSD_CARD_TYPE_HS400_1_8V | \ + EXT_CSD_CARD_TYPE_HS400_1_2V) #define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */ #define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */ @@ -373,6 +381,11 @@ struct _mmc_csd { #define EXT_CSD_DDR_BUS_WIDTH_4 5 /* Card is in 4 bit DDR mode */ #define EXT_CSD_DDR_BUS_WIDTH_8 6 /* Card is in 8 bit DDR mode */ +#define EXT_CSD_TIMING_BC 0 /* Backwards compatibility */ +#define EXT_CSD_TIMING_HS 1 /* High speed */ +#define EXT_CSD_TIMING_HS200 2 /* HS200 */ +#define EXT_CSD_TIMING_HS400 3 /* HS400 */ + #define EXT_CSD_SEC_ER_EN BIT(0) #define EXT_CSD_SEC_BD_BLK_EN BIT(2) #define EXT_CSD_SEC_GB_CL_EN BIT(4) diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 7be12b883485..08abe9941884 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -57,12 +57,8 @@ struct sdhci_host { #define SDHCI_QUIRK_BROKEN_CARD_DETECTION (1<<15) /* Controller reports inverted write-protect state */ #define SDHCI_QUIRK_INVERTED_WRITE_PROTECT (1<<16) -/*
Controller has nonstandard clock management */ -#define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) /* Controller does not like fast PIO transfers */ #define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) -/* Controller losing signal/interrupt enable states after reset */ -#define SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET (1<<19) /* Controller has to be forced to use block size of 2048 bytes */ #define SDHCI_QUIRK_FORCE_BLK_SZ_2048 (1<<20) /* Controller cannot do multi-block transfers */ @@ -147,6 +143,7 @@ struct sdhci_host { bool runtime_suspended; /* Host is runtime suspended */ bool bus_on; /* Bus power prevents runtime suspend */ + bool preset_enabled; /* Preset is enabled */ struct mmc_request *mrq; /* Current request */ struct mmc_command *cmd; /* Current command */ @@ -164,8 +161,7 @@ struct sdhci_host { dma_addr_t adma_addr; /* Mapped ADMA descr. table */ dma_addr_t align_addr; /* Mapped bounce buffer */ - struct tasklet_struct card_tasklet; /* Tasklet structures */ - struct tasklet_struct finish_tasklet; + struct tasklet_struct finish_tasklet; /* Tasklet structures */ struct timer_list timer; /* Timer for timeouts */ @@ -177,6 +173,13 @@ struct sdhci_host { unsigned int ocr_avail_mmc; u32 ocr_mask; /* available voltages */ + unsigned timing; /* Current timing */ + + u32 thread_isr; + + /* cached registers */ + u32 ier; + wait_queue_head_t buf_ready_int; /* Waitqueue for Buffer Read Ready interrupt */ unsigned int tuning_done; /* Condition flag set when CMD19 succeeds */ diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 3e794c12e90a..610af5155ef2 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -46,6 +46,9 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc enum nfs3_stable_how { NFS_UNSTABLE = 0, NFS_DATA_SYNC = 1, - NFS_FILE_SYNC = 2 + NFS_FILE_SYNC = 2, + + /* used by direct.c to mark verf as invalid */ + NFS_INVALID_STABLE_HOW = -1 }; #endif /* _LINUX_NFS_H */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 12c2cb947df5..a1e3064a8d99 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -399,8 +399,6 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) -#define FATTR4_WORD2_CHANGE_SECURITY_LABEL \ - (1UL << 17) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fa6918b0f829..919576b8e2cf 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -520,7 +520,6 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); /* * Try to write back everything synchronously (but check the @@ -553,7 +552,6 @@ nfs_have_writebacks(struct inode *inode) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 92ce5783b707..7d9096d95d4a 100644 --- a/include/linux/nfs_page.h +++ 
b/include/linux/nfs_page.h @@ -22,12 +22,17 @@ * Valid flags for a dirty buffer */ enum { - PG_BUSY = 0, - PG_MAPPED, - PG_CLEAN, - PG_NEED_COMMIT, - PG_NEED_RESCHED, - PG_COMMIT_TO_DS, + PG_BUSY = 0, /* nfs_{un}lock_request */ + PG_MAPPED, /* page private set for buffered io */ + PG_CLEAN, /* write succeeded */ + PG_COMMIT_TO_DS, /* used by pnfs layouts */ + PG_INODE_REF, /* extra ref held by inode (head req only) */ + PG_HEADLOCK, /* page group lock of wb_head */ + PG_TEARDOWN, /* page group sync for destroy */ + PG_UNLOCKPAGE, /* page group sync bit in read path */ + PG_UPTODATE, /* page group sync bit in read path */ + PG_WB_END, /* page group sync bit in write path */ + PG_REMOVE, /* page group sync bit in write path */ }; struct nfs_inode; @@ -43,15 +48,29 @@ struct nfs_page { struct kref wb_kref; /* reference count */ unsigned long wb_flags; struct nfs_write_verifier wb_verf; /* Commit cookie */ + struct nfs_page *wb_this_page; /* list of reqs for this page */ + struct nfs_page *wb_head; /* head pointer for req list */ }; struct nfs_pageio_descriptor; struct nfs_pageio_ops { void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *); - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, + struct nfs_page *); int (*pg_doio)(struct nfs_pageio_descriptor *); }; +struct nfs_rw_ops { + const fmode_t rw_mode; + struct nfs_rw_header *(*rw_alloc_header)(void); + void (*rw_free_header)(struct nfs_rw_header *); + void (*rw_release)(struct nfs_pgio_data *); + int (*rw_done)(struct rpc_task *, struct nfs_pgio_data *, struct inode *); + void (*rw_result)(struct rpc_task *, struct nfs_pgio_data *); + void (*rw_initiate)(struct nfs_pgio_data *, struct rpc_message *, + struct rpc_task_setup *, int); +}; + struct nfs_pageio_descriptor { struct list_head pg_list; unsigned long pg_bytes_written; @@ -63,6 +82,7 @@ struct nfs_pageio_descriptor { struct inode *pg_inode; const struct nfs_pageio_ops *pg_ops; + const struct nfs_rw_ops *pg_rw_ops; int pg_ioflags; int pg_error; const struct rpc_call_ops *pg_rpc_callops; @@ -75,29 +95,33 @@ struct nfs_pageio_descriptor { #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, - struct inode *inode, struct page *page, + struct nfs_page *last, unsigned int offset, unsigned int count); -extern void nfs_release_request(struct nfs_page *req); +extern void nfs_release_request(struct nfs_page *); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t); -extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, +extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req); extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); -extern void nfs_unlock_and_release_request(struct nfs_page *req); +extern void nfs_unlock_and_release_request(struct nfs_page *); +extern void nfs_page_group_lock(struct nfs_page *); +extern void nfs_page_group_unlock(struct nfs_page *); 
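The pg_test method change visible above (bool to size_t, for both the ops vector and nfs_generic_pg_test) also changes the contract: rather than a yes/no answer on whether a request may be coalesced into the current descriptor, the callback now reports how many bytes of it may be, with 0 meaning none. A hedged sketch of a generic implementation under that reading; the pg_count/pg_bsize accounting fields are assumed from the descriptor (they fall in the part of the struct this hunk elides), and a real pg_test would add contiguity checks:

static size_t example_pg_test(struct nfs_pageio_descriptor *desc,
			      struct nfs_page *prev, struct nfs_page *req)
{
	/* No room left at all: the request cannot be coalesced. */
	if (desc->pg_count >= desc->pg_bsize)
		return 0;
	/* Otherwise report how much fits; returning less than
	 * req->wb_bytes invites the caller to split the request. */
	return min(desc->pg_bsize - desc->pg_count,
		   (size_t)req->wb_bytes);
}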
+extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); /* * Lock the page of an asynchronous request diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6fb5b2335b59..9a1396e70310 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -489,31 +489,21 @@ struct nfs4_delegreturnres { }; /* - * Arguments to the read call. + * Arguments to the write call. */ -struct nfs_readargs { - struct nfs4_sequence_args seq_args; - struct nfs_fh * fh; - struct nfs_open_context *context; - struct nfs_lock_context *lock_context; - nfs4_stateid stateid; - __u64 offset; - __u32 count; - unsigned int pgbase; - struct page ** pages; +struct nfs_write_verifier { + char data[8]; }; -struct nfs_readres { - struct nfs4_sequence_res seq_res; - struct nfs_fattr * fattr; - __u32 count; - int eof; +struct nfs_writeverf { + struct nfs_write_verifier verifier; + enum nfs3_stable_how committed; }; /* - * Arguments to the write call. + * Arguments shared by the read and write call. */ -struct nfs_writeargs { +struct nfs_pgio_args { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; struct nfs_open_context *context; @@ -521,27 +511,20 @@ struct nfs_writeargs { nfs4_stateid stateid; __u64 offset; __u32 count; - enum nfs3_stable_how stable; unsigned int pgbase; struct page ** pages; - const u32 * bitmask; -}; - -struct nfs_write_verifier { - char data[8]; + const u32 * bitmask; /* used by write */ + enum nfs3_stable_how stable; /* used by write */ }; -struct nfs_writeverf { - struct nfs_write_verifier verifier; - enum nfs3_stable_how committed; -}; - -struct nfs_writeres { +struct nfs_pgio_res { struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; - struct nfs_writeverf * verf; __u32 count; - const struct nfs_server *server; + int eof; /* used by read */ + struct nfs_writeverf * verf; /* used by write */ + const struct nfs_server *server; /* used by write */ + }; /* @@ -1129,6 +1112,7 @@ struct pnfs_commit_bucket { struct list_head committing; struct pnfs_layout_segment *wlseg; struct pnfs_layout_segment *clseg; + struct nfs_writeverf direct_verf; }; struct pnfs_ds_commit_info { @@ -1264,20 +1248,6 @@ struct nfs_page_array { struct page *page_array[NFS_PAGEVEC_SIZE]; }; -struct nfs_read_data { - struct nfs_pgio_header *header; - struct list_head list; - struct rpc_task task; - struct nfs_fattr fattr; /* fattr storage */ - struct nfs_readargs args; - struct nfs_readres res; - unsigned long timestamp; /* For lease renewal */ - int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); - __u64 mds_offset; - struct nfs_page_array pages; - struct nfs_client *ds_clp; /* pNFS data server */ -}; - /* used as flag bits in nfs_pgio_header */ enum { NFS_IOHDR_ERROR = 0, @@ -1287,19 +1257,22 @@ enum { NFS_IOHDR_NEED_RESCHED, }; +struct nfs_pgio_data; + struct nfs_pgio_header { struct inode *inode; struct rpc_cred *cred; struct list_head pages; - struct list_head rpc_list; + struct nfs_pgio_data *data; atomic_t refcnt; struct nfs_page *req; - struct nfs_writeverf *verf; + struct nfs_writeverf verf; /* Used for writes */ struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; void (*release) (struct nfs_pgio_header *hdr); const struct nfs_pgio_completion_ops *completion_ops; + const struct nfs_rw_ops *rw_ops; struct nfs_direct_req *dreq; void *layout_private; spinlock_t lock; @@ -1310,30 +1283,24 @@ struct nfs_pgio_header { unsigned long flags; }; -struct nfs_read_header { - struct nfs_pgio_header header; - struct nfs_read_data 
rpc_data; -}; - -struct nfs_write_data { +struct nfs_pgio_data { struct nfs_pgio_header *header; - struct list_head list; struct rpc_task task; struct nfs_fattr fattr; - struct nfs_writeverf verf; - struct nfs_writeargs args; /* argument struct */ - struct nfs_writeres res; /* result struct */ + struct nfs_writeverf verf; /* Used for writes */ + struct nfs_pgio_args args; /* argument struct */ + struct nfs_pgio_res res; /* result struct */ unsigned long timestamp; /* For lease renewal */ - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); + int (*pgio_done_cb) (struct rpc_task *task, struct nfs_pgio_data *data); __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array pages; struct nfs_client *ds_clp; /* pNFS data server */ + int ds_idx; /* ds index if ds_clp is set */ }; -struct nfs_write_header { +struct nfs_rw_header { struct nfs_pgio_header header; - struct nfs_write_data rpc_data; - struct nfs_writeverf verf; + struct nfs_pgio_data rpc_data; }; struct nfs_mds_commit_info { @@ -1465,16 +1432,11 @@ struct nfs_rpc_ops { struct nfs_pathconf *); int (*set_capabilities)(struct nfs_server *, struct nfs_fh *); int (*decode_dirent)(struct xdr_stream *, struct nfs_entry *, int); - void (*read_setup) (struct nfs_read_data *, struct rpc_message *); - void (*read_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); - int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *); - int (*read_done) (struct rpc_task *, struct nfs_read_data *); - void (*write_setup) (struct nfs_write_data *, struct rpc_message *); - void (*write_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, int, - const struct nfs_pgio_completion_ops *); - int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *); - int (*write_done) (struct rpc_task *, struct nfs_write_data *); + int (*pgio_rpc_prepare)(struct rpc_task *, struct nfs_pgio_data *); + void (*read_setup) (struct nfs_pgio_data *, struct rpc_message *); + int (*read_done) (struct rpc_task *, struct nfs_pgio_data *); + void (*write_setup) (struct nfs_pgio_data *, struct rpc_message *); + int (*write_done) (struct rpc_task *, struct nfs_pgio_data *); void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *); void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *); int (*commit_done) (struct rpc_task *, struct nfs_commit_data *); diff --git a/include/linux/nfsd/debug.h b/include/linux/nfsd/debug.h deleted file mode 100644 index 19ef8375b577..000000000000 --- a/include/linux/nfsd/debug.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * linux/include/linux/nfsd/debug.h - * - * Debugging-related stuff for nfsd - * - * Copyright (C) 1995 Olaf Kirch <okir@monad.swb.de> - */ -#ifndef LINUX_NFSD_DEBUG_H -#define LINUX_NFSD_DEBUG_H - -#include <uapi/linux/nfsd/debug.h> - -# undef ifdebug -# ifdef NFSD_DEBUG -# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) -# else -# define ifdebug(flag) if (0) -# endif -#endif /* LINUX_NFSD_DEBUG_H */ diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h deleted file mode 100644 index a93593f1fa4e..000000000000 --- a/include/linux/nfsd/nfsfh.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * include/linux/nfsd/nfsfh.h - * - * This file describes the layout of the file handles as passed - * over the wire. - * - * Earlier versions of knfsd used to sign file handles using keyed MD5 - * or SHA. 
I've removed this code, because it doesn't give you more - * security than blocking external access to port 2049 on your firewall. - * - * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> - */ -#ifndef _LINUX_NFSD_FH_H -#define _LINUX_NFSD_FH_H - -# include <linux/sunrpc/svc.h> -#include <uapi/linux/nfsd/nfsfh.h> - -static inline __u32 ino_t_to_u32(ino_t ino) -{ - return (__u32) ino; -} - -static inline ino_t u32_to_ino_t(__u32 uino) -{ - return (ino_t) uino; -} - -/* - * This is the internal representation of an NFS handle used in knfsd. - * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. - */ -typedef struct svc_fh { - struct knfsd_fh fh_handle; /* FH data */ - struct dentry * fh_dentry; /* validated dentry */ - struct svc_export * fh_export; /* export pointer */ - int fh_maxsize; /* max size for fh_handle */ - - unsigned char fh_locked; /* inode locked by us */ - unsigned char fh_want_write; /* remount protection taken */ - -#ifdef CONFIG_NFSD_V3 - unsigned char fh_post_saved; /* post-op attrs saved */ - unsigned char fh_pre_saved; /* pre-op attrs saved */ - - /* Pre-op attributes saved during fh_lock */ - __u64 fh_pre_size; /* size before operation */ - struct timespec fh_pre_mtime; /* mtime before oper */ - struct timespec fh_pre_ctime; /* ctime before oper */ - /* - * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) - * to find out if it is valid. - */ - u64 fh_pre_change; - - /* Post-op attributes saved in fh_unlock */ - struct kstat fh_post_attr; /* full attrs after operation */ - u64 fh_post_change; /* nfsv4 change; see above */ -#endif /* CONFIG_NFSD_V3 */ - -} svc_fh; - -#endif /* _LINUX_NFSD_FH_H */ diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index c29a6dee6bec..88e6ea4a5d36 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -1,23 +1,6 @@ -/* - * OMAP DMA Engine support - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ #ifndef __LINUX_OMAP_DMA_H #define __LINUX_OMAP_DMA_H - -struct dma_chan; - -#if defined(CONFIG_DMA_OMAP) || (defined(CONFIG_DMA_OMAP_MODULE) && defined(MODULE)) -bool omap_dma_filter_fn(struct dma_chan *, void *); -#else -static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) -{ - return false; -} -#endif +#include <linux/omap-dmaengine.h> /* * Legacy OMAP DMA handling defines and functions diff --git a/include/linux/omap-dmaengine.h b/include/linux/omap-dmaengine.h new file mode 100644 index 000000000000..8e6906c72e90 --- /dev/null +++ b/include/linux/omap-dmaengine.h @@ -0,0 +1,21 @@ +/* + * OMAP DMA Engine support + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef __LINUX_OMAP_DMAENGINE_H +#define __LINUX_OMAP_DMAENGINE_H + +struct dma_chan; + +#if defined(CONFIG_DMA_OMAP) || (defined(CONFIG_DMA_OMAP_MODULE) && defined(MODULE)) +bool omap_dma_filter_fn(struct dma_chan *, void *); +#else +static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) +{ + return false; +} +#endif +#endif /* __LINUX_OMAP_DMAENGINE_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 04e763221246..1bc7cd05b22e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -244,6 +244,7 @@ struct svc_rqst { struct page * rq_pages[RPCSVC_MAXPAGES]; struct page * *rq_respages; /* points into rq_pages */ struct page * *rq_next_page; /* next reply page to use */ + struct page * *rq_page_end; /* one past the last page */ struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ @@ -254,11 +255,15 @@ struct svc_rqst { u32 rq_prot; /* IP protocol */ unsigned short rq_secure : 1; /* secure port */ + unsigned short rq_local : 1; /* local request */ void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ - + int rq_auth_slack; /* extra space xdr code + * should leave in head + * for krb5i, krb5p. + */ int rq_reserved; /* space on socket outq * reserved for this request */ @@ -454,11 +459,7 @@ char * svc_print_addr(struct svc_rqst *, char *, size_t); */ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) { - int added_space = 0; - - if (rqstp->rq_authop->flavour) - added_space = RPC_MAX_AUTH_SIZE; - svc_reserve(rqstp, space + added_space); + svc_reserve(rqstp, space + rqstp->rq_auth_slack); } #endif /* SUNRPC_SVC_H */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0b8e3e6bdacf..5cf99a016368 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -115,14 +115,13 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { - struct svc_rdma_fastreg_mr *frmr; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; + unsigned long lkey[RPCSVC_MAXPAGES]; }; }; -#define RDMACTXT_F_FAST_UNREG 1 #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index b05963f09ebf..7235040a19b2 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -24,6 +24,7 @@ struct svc_xprt_ops { void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); + int (*xpo_secure_port)(struct svc_rqst *); }; struct svc_xprt_class { @@ -63,6 +64,7 @@ struct svc_xprt { #define XPT_DETACHED 10 /* detached from tempsocks list */ #define XPT_LISTENER 11 /* listening endpoint */ #define XPT_CACHE_AUTH 12 /* cache auth info */ +#define XPT_LOCAL 13 /* connection from loopback interface */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 15f9204ee70b..70c6b92e15a7 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -215,6 +215,9 @@ typedef int (*kxdrdproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); +extern 
void xdr_commit_encode(struct xdr_stream *xdr); +extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); +extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3876f0f1dfd3..fcbfe8783243 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -24,6 +24,12 @@ #define RPC_MAX_SLOT_TABLE_LIMIT (65536U) #define RPC_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE_LIMIT +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) + /* * This describes a timeout strategy */ diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h index 616e3b396476..20391235d088 100644 --- a/include/uapi/linux/nfsd/nfsfh.h +++ b/include/uapi/linux/nfsd/nfsfh.h @@ -1,13 +1,7 @@ /* - * include/linux/nfsd/nfsfh.h - * * This file describes the layout of the file handles as passed * over the wire. * - * Earlier versions of knfsd used to sign file handles using keyed MD5 - * or SHA. I've removed this code, because it doesn't give you more - * security than blocking external access to port 2049 on your firewall. - * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ @@ -37,7 +31,7 @@ struct nfs_fhbase_old { }; /* - * This is the new flexible, extensible style NFSv2/v3 file handle. + * This is the new flexible, extensible style NFSv2/v3/v4 file handle. * by Neil Brown <neilb@cse.unsw.edu.au> - March 2000 * * The file handle starts with a sequence of four-byte words. * @@ -47,14 +41,7 @@ struct nfs_fhbase_old { * * All four-byte values are in host-byte-order. * - * The auth_type field specifies how the filehandle can be authenticated - * This might allow a file to be confirmed to be in a writable part of a - * filetree without checking the path from it up to the root. - * Current values: - * 0 - No authentication. fb_auth is 0 bytes long - * Possible future values: - * 1 - 4 bytes taken from MD5 hash of the remainer of the file handle - * prefixed by a secret and with the important export flags. + * The auth_type field is deprecated and must be set to 0. * * The fsid_type identifies how the filesystem (or export point) is * encoded. @@ -71,14 +58,9 @@ struct nfs_fhbase_old { * 7 - 8 byte inode number and 16 byte uuid * * The fileid_type identified how the file within the filesystem is encoded. - * This is (will be) passed to, and set by, the underlying filesystem if it supports - * filehandle operations. The filesystem must not use the value '0' or '0xff' and may - * only use the values 1 and 2 as defined below: - * Current values: - * 0 - The root, or export point, of the filesystem. fb_fileid is 0 bytes. - * 1 - 32bit inode number, 32 bit generation number. - * 2 - 32bit inode number, 32 bit generation number, 32 bit parent directory inode number. - * + * The values for this field are filesystem specific, except that + * filesystems must not use the values '0' or '0xff'. See 'enum fid_type' + * in include/linux/exportfs.h for currently registered values.
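+ *
+ * For illustration, a few of the values registered in that enum
+ * (abridged; the header is authoritative):
+ *
+ *	FILEID_ROOT = 0			root, or export point
+ *	FILEID_INO32_GEN = 1		32bit inode and generation numbers
+ *	FILEID_INO32_GEN_PARENT = 2	as above, plus 32bit parent inode
+ *	FILEID_INVALID = 0xff		reserved, never a valid fileid_type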
*/ struct nfs_fhbase_new { __u8 fb_version; /* == 1, even => nfs_fhbase_old */ @@ -114,9 +96,9 @@ struct knfsd_fh { #define fh_fsid_type fh_base.fh_new.fb_fsid_type #define fh_auth_type fh_base.fh_new.fb_auth_type #define fh_fileid_type fh_base.fh_new.fb_fileid_type -#define fh_auth fh_base.fh_new.fb_auth #define fh_fsid fh_base.fh_new.fb_auth - +/* Do not use, provided for userspace compatibility. */ +#define fh_auth fh_base.fh_new.fb_auth #endif /* _UAPI_LINUX_NFSD_FH_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 84b2bbf443e7..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -424,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check capability with the inode's uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace, and the given inode's uid and gid are + * mapped into the current user namespace. */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 27ce26240932..92d5ab99fbf3 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -218,10 +218,8 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) spin_lock(&registered_mechs_lock); list_for_each_entry(pos, &registered_mechs, gm_list) { - if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { - module_put(pos->gm_owner); + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; - } if (try_module_get(pos->gm_owner)) gm = pos; break; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 0f73f4507746..4ce5eccec1f6 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1503,6 +1503,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_integ_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE; break; case RPC_GSS_SVC_PRIVACY: /* placeholders for length and seq.
number: */ @@ -1511,6 +1512,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if (unwrap_priv_data(rqstp, &rqstp->rq_arg, gc->gc_seq, rsci->mechctx)) goto garbage_args; + rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2; break; default: goto auth_err; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index ae333c1845bb..066362141133 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -374,7 +374,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) } return; out: - printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); + printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 25578afe1548..c0365c14b858 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -832,7 +832,8 @@ static void rpc_async_schedule(struct work_struct *work) * @size: requested byte size * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL if the request cannot be serviced immediately. + * returning NULL and suppressing the warning if the request cannot be serviced + * immediately. * The caller can arrange to sleep in a way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a @@ -845,7 +846,7 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) gfp |= __GFP_MEMALLOC; diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 14c9f6d1c5ff..f2b7cb540e61 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task) (task->tk_msg.rpc_proc->p_decode != NULL); } +static inline int sock_is_loopback(struct sock *sk) +{ + struct dst_entry *dst; + int loopback = 0; + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); + if (dst && dst->dev && + (dst->dev->features & NETIF_F_LOOPBACK)) + loopback = 1; + rcu_read_unlock(); + return loopback; +} + int svc_send_common(struct socket *sock, struct xdr_buf *xdr, struct page *headpage, unsigned long headoffset, struct page *tailpage, unsigned long tailoffset); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 06c6ff0cb911..b4737fbdec13 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -597,6 +597,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } rqstp->rq_pages[i] = p; } + rqstp->rq_page_end = &rqstp->rq_pages[i]; rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ /* Make arg->head point to first page and arg->pages point to rest */ @@ -730,6 +731,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) newxpt = xprt->xpt_ops->xpo_accept(xprt); if (newxpt) svc_add_new_temp_xprt(serv, newxpt); + else + module_put(xprt->xpt_class->xcl_owner); } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", @@ -793,7 +796,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp); rqstp->rq_chandle.defer = svc_defer; if (serv->sv_stats) diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 2af7b0cba43a..79c0f3459b5c 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ 
-54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) } spin_unlock(&authtab_lock); + rqstp->rq_auth_slack = 0; + rqstp->rq_authop = aops; return aops->accept(rqstp, authp); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 43bcb4699d69..b507cd327d9b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -400,6 +400,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, release_sock(sock->sk); #endif } + +static int svc_sock_secure_port(struct svc_rqst *rqstp) +{ + return svc_port_is_privileged(svc_addr(rqstp)); +} + /* * INET callback when data has been received on the socket. */ @@ -678,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_udp_class = { @@ -842,8 +849,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) * tell us anything. For now just warn about unpriv connections. */ if (!svc_port_is_privileged(sin)) { - dprintk(KERN_WARNING - "%s: connect from unprivileged port: %s\n", + dprintk("%s: connect from unprivileged port: %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); } @@ -867,6 +873,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); + if (sock_is_loopback(newsock->sk)) + set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); + else + clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1112,6 +1122,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; + rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; @@ -1234,6 +1245,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_bc_class = { @@ -1272,6 +1284,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, + .xpo_secure_port = svc_sock_secure_port, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index dd97ba3c4456..23fb4e75e245 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -462,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; + xdr_set_scratch_buffer(xdr, NULL, 0); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -482,6 +483,73 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_encode); /** + * xdr_commit_encode - Ensure all data is written to buffer + * @xdr: pointer to xdr_stream + * + * We handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place; xdr_commit_encode does that copying. + * + * Normally the caller doesn't need to call this directly, as the + * following xdr_reserve_space will do it. But an explicit call may be + * required at the end of encoding, or any other time when the xdr_buf + * data might be read. 
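+ *
+ * A minimal usage sketch (illustrative only; a and b stand for
+ * hypothetical values being encoded):
+ *
+ *	p = xdr_reserve_space(xdr, 2 * sizeof(__be32));
+ *	if (!p)
+ *		return -EMSGSIZE;
+ *	*p++ = cpu_to_be32(a);
+ *	*p = cpu_to_be32(b);
+ *	xdr_commit_encode(xdr);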
+ */ +void xdr_commit_encode(struct xdr_stream *xdr) +{ + int shift = xdr->scratch.iov_len; + void *page; + + if (shift == 0) + return; + page = page_address(*xdr->page_ptr); + memcpy(xdr->scratch.iov_base, page, shift); + memmove(page, page + shift, (void *)xdr->p - page); + xdr->scratch.iov_len = 0; +} +EXPORT_SYMBOL_GPL(xdr_commit_encode); + +__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p; + int space_left; + int frag1bytes, frag2bytes; + + if (nbytes > PAGE_SIZE) + return NULL; /* Bigger buffers require special handling */ + if (xdr->buf->len + nbytes > xdr->buf->buflen) + return NULL; /* Sorry, we're totally out of space */ + frag1bytes = (xdr->end - xdr->p) << 2; + frag2bytes = nbytes - frag1bytes; + if (xdr->iov) + xdr->iov->iov_len += frag1bytes; + else + xdr->buf->page_len += frag1bytes; + xdr->page_ptr++; + xdr->iov = NULL; + /* + * If the last encode didn't end exactly on a page boundary, the + * next one will straddle boundaries. Encode into the next + * page, then copy it back later in xdr_commit_encode. We use + * the "scratch" iov to track any temporarily unused fragment of + * space at the end of the previous buffer: + */ + xdr->scratch.iov_base = xdr->p; + xdr->scratch.iov_len = frag1bytes; + p = page_address(*xdr->page_ptr); + /* + * Note this is where the next encode will start after we've + * shifted this one back: + */ + xdr->p = (void *)p + frag2bytes; + space_left = xdr->buf->buflen - xdr->buf->len; + xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE); + xdr->buf->page_len += frag2bytes; + xdr->buf->len += nbytes; + return p; +} + +/** * xdr_reserve_space - Reserve buffer space for sending * @xdr: pointer to xdr_stream * @nbytes: number of bytes to reserve @@ -495,20 +563,122 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) __be32 *p = xdr->p; __be32 *q; + xdr_commit_encode(xdr); /* align nbytes on the next 32-bit boundary */ nbytes += 3; nbytes &= ~3; q = p + (nbytes >> 2); if (unlikely(q > xdr->end || q < p)) - return NULL; + return xdr_get_next_encode_buffer(xdr, nbytes); xdr->p = q; - xdr->iov->iov_len += nbytes; + if (xdr->iov) + xdr->iov->iov_len += nbytes; + else + xdr->buf->page_len += nbytes; xdr->buf->len += nbytes; return p; } EXPORT_SYMBOL_GPL(xdr_reserve_space); /** + * xdr_truncate_encode - truncate an encode buffer + * @xdr: pointer to xdr_stream + * @len: new length of buffer + * + * Truncates the xdr stream, so that xdr->buf->len == len, + * and xdr->p points at offset len from the start of the buffer, and + * head, tail, and page lengths are adjusted to correspond. + * + * If this means moving xdr->p to a different buffer, we assume + * that the end pointer should be set to the end of the current page, + * except in the case of the head buffer when we assume the head + * buffer's current length represents the end of the available buffer. + * + * This is *not* safe to use on a buffer that already has inlined page + * cache pages (as in a zero-copy server read reply), except for the + * simple case of truncating from one position in the tail to another.
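+ *
+ * Illustrative use (encode_one_entry is a hypothetical encoder, not
+ * part of this patch):
+ *
+ *	unsigned int start = xdr->buf->len;
+ *
+ *	if (encode_one_entry(xdr) < 0)
+ *		xdr_truncate_encode(xdr, start);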
+ * + */ +void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + int fraglen; + int new, old; + + if (len > buf->len) { + WARN_ON_ONCE(1); + return; + } + xdr_commit_encode(xdr); + + fraglen = min_t(int, buf->len - len, tail->iov_len); + tail->iov_len -= fraglen; + buf->len -= fraglen; + if (tail->iov_len && buf->len == len) { + xdr->p = tail->iov_base + tail->iov_len; + /* xdr->end, xdr->iov should be set already */ + return; + } + WARN_ON_ONCE(fraglen); + fraglen = min_t(int, buf->len - len, buf->page_len); + buf->page_len -= fraglen; + buf->len -= fraglen; + + new = buf->page_base + buf->page_len; + old = new + fraglen; + xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT); + + if (buf->page_len && buf->len == len) { + xdr->p = page_address(*xdr->page_ptr); + xdr->end = (void *)xdr->p + PAGE_SIZE; + xdr->p = (void *)xdr->p + (new % PAGE_SIZE); + /* xdr->iov should already be NULL */ + return; + } + if (fraglen) { + xdr->end = head->iov_base + head->iov_len; + xdr->page_ptr--; + } + /* (otherwise assume xdr->end is already set) */ + head->iov_len = len; + buf->len = len; + xdr->p = head->iov_base + head->iov_len; + xdr->iov = buf->head; +} +EXPORT_SYMBOL(xdr_truncate_encode); + +/** + * xdr_restrict_buflen - decrease available buffer space + * @xdr: pointer to xdr_stream + * @newbuflen: new maximum number of bytes available + * + * Adjust our idea of how much space is available in the buffer. + * If we've already used too much space in the buffer, returns -1. + * If the available space is already smaller than newbuflen, returns 0 + * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen + * and ensures xdr->end is set at most offset newbuflen from the start + * of the buffer. + */ +int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen) +{ + struct xdr_buf *buf = xdr->buf; + int left_in_this_buf = (void *)xdr->end - (void *)xdr->p; + int end_offset = buf->len + left_in_this_buf; + + if (newbuflen < 0 || newbuflen < buf->len) + return -1; + if (newbuflen > buf->buflen) + return 0; + if (newbuflen < end_offset) + xdr->end = (void *)xdr->end + newbuflen - end_offset; + buf->buflen = newbuflen; + return 0; +} +EXPORT_SYMBOL(xdr_restrict_buflen); + +/** * xdr_write_pages - Insert a list of pages into an XDR buffer for sending * @xdr: pointer to xdr_stream * @pages: list of pages diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 89d051de6b3e..c3b2b3369e52 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. 
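For illustration, the update rule described in this comment, in the same fixed-point units (cwnd scaled by RPC_CWNDSCALE); this is essentially what xprt_adjust_cwnd(), which now carries the comment, does, with locking details omitted:

	unsigned long cwnd = xprt->cwnd;

	if (result == -ETIMEDOUT) {
		/* retransmit: multiplicative decrease, floored at one request */
		cwnd >>= 1;
		if (cwnd < RPC_CWNDSCALE)
			cwnd = RPC_CWNDSCALE;
	} else if (result >= 0 && cwnd <= xprt->cong) {
		/* reply with a full window outstanding: grow by ~1/cwnd, capped */
		cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
		if (cwnd > RPC_MAXCWND(xprt))
			cwnd = RPC_MAXCWND(xprt);
	}
	xprt->cwnd = cwnd;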
- */ -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) - -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * - * We use a time-smoothed congestion estimator to avoid heavy oscillation. + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. */ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead526b125..693966d3f33b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -78,8 +78,7 @@ static const char transfertypes[][12] = { * elements. Segments are then coalesced when registered, if possible * within the selected memreg mode. * - * Note, this routine is never called if the connection's memory - * registration strategy is 0 (bounce buffers). + * Returns positive number of segments converted, or a negative errno. */ static int @@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; while (len && n < nsegs) { + if (!ppages[p]) { + /* alloc the pagelist for receiving buffer */ + ppages[p] = alloc_page(GFP_ATOMIC); + if (!ppages[p]) + return -ENOMEM; + } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - BUG_ON(seg[n].mr_len > PAGE_SIZE); + if (seg[n].mr_len > PAGE_SIZE) + return -EIO; len -= seg[n].mr_len; ++n; ++p; @@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, /* Message overflows the seg array */ if (len && n == nsegs) - return 0; + return -EIO; if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing @@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; if (n == nsegs) /* Tail remains, but we're out of segments */ - return 0; + return -EIO; seg[n].mr_page = NULL; seg[n].mr_offset = xdrbuf->tail[0].iov_base; seg[n].mr_len = xdrbuf->tail[0].iov_len; @@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * Reply chunk (a counted array): * N elements: * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns positive RPC/RDMA header size, or negative errno. 
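+ *
+ * For reference, each HLOO element above is laid out like the
+ * driver's struct rpcrdma_segment (see rpc_rdma.h):
+ *
+ *	struct rpcrdma_segment {
+ *		__be32 rs_handle;	H: rkey for the remote memory region
+ *		__be32 rs_length;	L: length of the segment in bytes
+ *		__be64 rs_offset;	OO: remote address, an XDR hyper
+ *	};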
*/ -static unsigned int +static ssize_t rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int nsegs, nchunks = 0; + int n, nsegs, nchunks = 0; unsigned int pos; struct rpcrdma_mr_seg *seg = req->rl_segments; struct rpcrdma_read_chunk *cur_rchunk = NULL; @@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, pos = target->head[0].iov_len; nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); - if (nsegs == 0) - return 0; + if (nsegs < 0) + return nsegs; do { - /* bind/register the memory, then build chunk from result. */ - int n = rpcrdma_register_external(seg, nsegs, + n = rpcrdma_register_external(seg, nsegs, cur_wchunk != NULL, r_xprt); if (n <= 0) goto out; @@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, /* success. all failures return above */ req->rl_nchunks = nchunks; - BUG_ON(nchunks == 0); - BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) - && (nchunks > 3)); - /* * finish off header. If write, marshal discrim and nchunks. */ @@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, out: for (pos = 0; nchunks--;) pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt, NULL); - return 0; + &req->rl_segments[pos], r_xprt); + return n; } /* @@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. * [2] -- optional padding. * [3] -- if padded, header only in [1] and data here. + * + * Returns zero on success, otherwise a negative errno. */ int @@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t hdrlen, rpclen, padlen; + size_t rpclen, padlen; + ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) wtype = rpcrdma_noch; - BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); - - if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && - (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { - /* forced to "pure inline"? */ - dprintk("RPC: %s: too much data (%d/%d) for inline\n", - __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); - return -1; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + dprintk("RPC: %s: cannot marshal multiple chunk lists\n", + __func__); + return -EIO; } hdrlen = 28; /*sizeof *headerp;*/ @@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - BUG_ON(wtype != rpcrdma_noch); - + if (wtype != rpcrdma_noch) { + dprintk("RPC: %s: invalid chunk list\n", + __func__); + return -EIO; + } } else { headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; @@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. 
*/ - if (wtype == rpcrdma_noch && - r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) + if (wtype == rpcrdma_noch) wtype = rpcrdma_replych; } } @@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, headerp, wtype); } - - if (hdrlen == 0) - return -1; + if (hdrlen < 0) + return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", @@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf = rqst->rq_rcv_buf; } -/* - * This function is called when an async event is posted to - * the connection which changes the connection state. All it - * does at this point is mark the connection up/down, the rpc - * timers do the rest. - */ void -rpcrdma_conn_func(struct rpcrdma_ep *ep) +rpcrdma_connect_worker(struct work_struct *work) { + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); @@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) } /* - * This function is called when memory window unbind which we are waiting - * for completes. Just use rr_func (zeroed by upcall) to signal completion. + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. */ -static void -rpcrdma_unbind_func(struct rpcrdma_rep *rep) +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) { - wake_up(&rep->rr_unbind); + schedule_delayed_work(&ep->rep_connect_worker, 0); } /* @@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpc_xprt *xprt = rep->rr_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; - int i, rdmalen, status; + int rdmalen, status; + unsigned long cwnd; /* Check status. If bad, signal disconnect and return rep to pool */ if (rep->rr_len == ~0U) { @@ -783,6 +785,7 @@ repost: /* from here on, the reply is no longer an orphan */ req->rl_reply = rep; + xprt->reestablish_timeout = 0; /* check for expected message types */ /* The order of some of these tests is important. */ @@ -857,26 +860,10 @@ badheader: break; } - /* If using mw bind, start the deregister process now. */ - /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ - if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS: - for (i = 0; req->rl_nchunks-- > 1;) - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); - /* Optionally wait (not here) for unbinds to complete */ - rep->rr_func = rpcrdma_unbind_func; - (void) rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, rep); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - for (i = 0; req->rl_nchunks--;) - i += rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, NULL); - break; - default: - break; - } + cwnd = xprt->cwnd; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 8d904e4eef15..8f92a61ee2df 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. 
All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* Set up the XDR head */ rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, byte_count, ctxt->sge[0].length); rqstp->rq_arg.len = byte_count; rqstp->rq_arg.buflen = byte_count; @@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; - bc -= min(bc, ctxt->sge[sge_no].length); + bc -= min_t(u32, bc, ctxt->sge[sge_no].length); rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } @@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Encode a read-chunk-list as an array of IB SGE - * - * Assumptions: - * - chunk[0]->position points to pages[0] at an offset of 0 - * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - * - */ -static int map_read_chunks(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { - int sge_no; - int sge_bytes; - int page_off; - int page_no; - int ch_bytes; - int ch_no; - struct rpcrdma_read_chunk *ch; + if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + RDMA_TRANSPORT_IWARP) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} - sge_no = 0; - page_no = 0; - page_off = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch_no = 0; - ch_bytes = ntohl(ch->rc_target.rs_length); - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = ch_bytes; - head->arg.len = rqstp->rq_arg.len + ch_bytes; - head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; - head->count++; - chl_map->ch[0].start = 0; - while (byte_count) { - rpl_map->sge[sge_no].iov_base = - page_address(rqstp->rq_arg.pages[page_no]) + page_off; - sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); - rpl_map->sge[sge_no].iov_len = sge_bytes; - /* - * Don't bump head->count here because the same page - * may be used by multiple SGE. 
- */ - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; +typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last); + +/* Issue an RDMA_READ using the local lkey to map the data sink */ +static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; + + ctxt->direction = DMA_FROM_DEVICE; + ctxt->read_hdr = head; + pages_needed = + min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; rqstp->rq_next_page = rqstp->rq_respages + 1; + ctxt->sge[pno].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, + PAGE_SIZE - pg_off, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[pno].addr); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); - byte_count -= sge_bytes; - ch_bytes -= sge_bytes; - sge_no++; - /* - * If all bytes for this chunk have been mapped to an - * SGE, move to the next SGE - */ - if (ch_bytes == 0) { - chl_map->ch[ch_no].count = - sge_no - chl_map->ch[ch_no].start; - ch_no++; - ch++; - chl_map->ch[ch_no].start = sge_no; - ch_bytes = ntohl(ch->rc_target.rs_length); - /* If bytes remaining account for next chunk */ - if (byte_count) { - head->arg.page_len += ch_bytes; - head->arg.len += ch_bytes; - head->arg.buflen += ch_bytes; - } + /* The lkey here is either a local dma lkey or a dma_mr lkey */ + ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].length = len; + ctxt->count++; + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; } - /* - * If this SGE consumed all of the page, move to the - * next page - */ - if ((sge_bytes + page_off) == PAGE_SIZE) { - page_no++; - page_off = 0; - /* - * If there are still bytes left to map, bump - * the page count - */ - if (byte_count) - head->count++; - } else - page_off += sge_bytes; + rs_length -= len; } - BUG_ON(byte_count != 0); - return sge_no; + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + 
*page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + return ret; } -/* Map a read-chunk-list to an XDR and fast register the page-list. - * - * Assumptions: - * - chunk[0] position points to pages[0] at an offset of 0 - * - pages[] will be made physically contiguous by creating a one-off memory - * region using the fastreg verb. - * - byte_count is # of bytes in read-chunk-list - * - ch_count is # of chunks in read-chunk-list - * - * Output: - * - sge array pointing into pages[] array. - * - chunk_sge array specifying sge index and count for each - * chunk in the read list - */ -static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, +/* Issue an RDMA_READ using an FRMR to map the data sink */ +static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - struct rpcrdma_msg *rmsgp, - struct svc_rdma_req_map *rpl_map, - struct svc_rdma_req_map *chl_map, - int ch_count, - int byte_count) + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) { - int page_no; - int ch_no; - u32 offset; - struct rpcrdma_read_chunk *ch; - struct svc_rdma_fastreg_mr *frmr; - int ret = 0; + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + struct ib_send_wr fastreg_wr; + u8 key; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; - frmr = svc_rdma_get_frmr(xprt); if (IS_ERR(frmr)) return -ENOMEM; - head->frmr = frmr; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; - head->hdr_count = head->count; /* save count of hdr pages */ - head->arg.page_base = 0; - head->arg.page_len = byte_count; - head->arg.len = rqstp->rq_arg.len + byte_count; - head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = frmr; + pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); - /* Fast register the page list */ - frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->map_len = byte_count; - frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; - for (page_no = 0; page_no < frmr->page_list_len; page_no++) { - frmr->page_list->page_list[page_no] = + frmr->map_len = pages_needed << PAGE_SHIFT; + frmr->page_list_len = pages_needed; + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + frmr->page_list->page_list[pno] = ib_dma_map_page(xprt->sc_cm_id->device, - rqstp->rq_arg.pages[page_no], 0, + head->arg.pages[pg_no], 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + 
frmr->page_list->page_list[pno]); + if (ret) + goto err; atomic_inc(&xprt->sc_dma_used); - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - } - head->count += page_no; - - /* rq_respages points one past arg pages */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - /* Create the reply and chunk maps */ - offset = 0; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - for (ch_no = 0; ch_no < ch_count; ch_no++) { - int len = ntohl(ch->rc_target.rs_length); - rpl_map->sge[ch_no].iov_base = frmr->kva + offset; - rpl_map->sge[ch_no].iov_len = len; - chl_map->ch[ch_no].count = 1; - chl_map->ch[ch_no].start = ch_no; - offset += len; - ch++; + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; } - ret = svc_rdma_fastreg(xprt, frmr); - if (ret) - goto fatal_err; - - return ch_no; - - fatal_err: - printk("svcrdma: error fast registering xdr for xprt %p", xprt); - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - -static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, - struct svc_rdma_op_ctxt *ctxt, - struct svc_rdma_fastreg_mr *frmr, - struct kvec *vec, - u64 *sgl_offset, - int count) -{ - int i; - unsigned long off; + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->count = count; - ctxt->direction = DMA_FROM_DEVICE; - for (i = 0; i < count; i++) { - ctxt->sge[i].length = 0; /* in case map fails */ - if (!frmr) { - BUG_ON(!virt_to_page(vec[i].iov_base)); - off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; - ctxt->sge[i].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(vec[i].iov_base), - off, - vec[i].iov_len, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[i].addr)) - return -EINVAL; - ctxt->sge[i].lkey = xprt->sc_dma_lkey; - atomic_inc(&xprt->sc_dma_used); - } else { - ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; - ctxt->sge[i].lkey = frmr->mr->lkey; - } - ctxt->sge[i].length = vec[i].iov_len; - *sgl_offset = *sgl_offset + vec[i].iov_len; + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].lkey = frmr->mr->lkey; + ctxt->sge[0].length = read; + ctxt->count = 1; + ctxt->read_hdr = head; + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + fastreg_wr.next = &read_wr; + + /* Prepare RDMA_READ */ + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = 1; + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + } else { + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.next = 
&inv_wr; + /* Prepare invalidate */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; + + /* Post the chain */ + ret = svc_rdma_send(xprt, &fastreg_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; } - return 0; -} -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) && - sge_count > 1) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + svc_rdma_put_frmr(xprt, frmr); + return ret; } -/* - * Use RDMA_READ to read data from the advertised client buffer into the - * XDR stream starting at rq_arg.head[0].iov_base. - * Each chunk in the array - * contains the following fields: - * discrim - '1', This isn't used for data placement - * position - The xdr stream offset (the same for every chunk) - * handle - RMR for client memory region - * length - data transfer length - * offset - 64 bit tagged offset in remote memory region - * - * On our side, we need to read into a pagelist. The first page immediately - * follows the RPC header. - * - * This function returns: - * 0 - No error and no read-list found. - * - * 1 - Successful read-list processing. The data is not yet in - * the pagelist and therefore the RPC request must be deferred. The - * I/O completion will enqueue the transport again and - * svc_rdma_recvfrom will complete the request. - * - * <0 - Error processing/posting read-list. - * - * NOTE: The ctxt must not be touched after the last WR has been posted - * because the I/O completion processing may occur on another - * processor and free / modify the context. Ne touche pas! - */ -static int rdma_read_xdr(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *hdr_ctxt) +static int rdma_read_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) { - struct ib_send_wr read_wr; - struct ib_send_wr inv_wr; - int err = 0; - int ch_no; - int ch_count; - int byte_count; - int sge_count; - u64 sgl_offset; + int page_no, ch_count, ret; struct rpcrdma_read_chunk *ch; - struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_req_map *rpl_map; - struct svc_rdma_req_map *chl_map; + u32 page_offset, byte_count; + u64 rs_offset; + rdma_reader_fn reader; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, if (ch_count > RPCSVC_MAXPAGES) return -EINVAL; - /* Allocate temporary reply and chunk maps */ - rpl_map = svc_rdma_get_req_map(); - chl_map = svc_rdma_get_req_map(); + /* The request is completed when the RDMA_READs complete. The + * head context keeps all the pages that comprise the + * request. 
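+ *
+ * Illustrative shape of the chunk walk below: for each read-list
+ * entry,
+ *
+ *	while (byte_count > 0) {
+ *		ret = reader(xprt, rqstp, head, &page_no, &page_offset,
+ *			     rs_handle, byte_count, rs_offset, last);
+ *		if (ret < 0)
+ *			break;
+ *		byte_count -= ret;
+ *		rs_offset += ret;
+ *	}
+ *
+ * where reader is rdma_read_chunk_frmr() on devices with fast
+ * registration support and rdma_read_chunk_lcl() otherwise.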
+ */ + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; - if (!xprt->sc_frmr_pg_list_len) - sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); + /* Use FRMR if supported */ + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) + reader = rdma_read_chunk_frmr; else - sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, - rpl_map, chl_map, ch_count, - byte_count); - if (sge_count < 0) { - err = -EIO; - goto out; - } - - sgl_offset = 0; - ch_no = 0; + reader = rdma_read_chunk_lcl; + page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - ch->rc_discrim != 0; ch++, ch_no++) { - u64 rs_offset; -next_sge: - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = hdr_ctxt->frmr; - ctxt->read_hdr = NULL; - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + ch->rc_discrim != 0; ch++) { - /* Prepare READ WR */ - memset(&read_wr, 0, sizeof read_wr); - read_wr.wr_id = (unsigned long)ctxt; - read_wr.opcode = IB_WR_RDMA_READ; - ctxt->wr_op = read_wr.opcode; - read_wr.send_flags = IB_SEND_SIGNALED; - read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset; - read_wr.sg_list = ctxt->sge; - read_wr.num_sge = - rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); - err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, - &rpl_map->sge[chl_map->ch[ch_no].start], - &sgl_offset, - read_wr.num_sge); - if (err) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; - } - if (((ch+1)->rc_discrim == 0) && - (read_wr.num_sge == chl_map->ch[ch_no].count)) { - /* - * Mark the last RDMA_READ with a bit to - * indicate all RPC data has been fetched from - * the client and the RPC needs to be enqueued. - */ - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - if (hdr_ctxt->frmr) { - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - /* - * Invalidate the local MR used to map the data - * sink. 
- */ - if (xprt->sc_dev_caps & - SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.opcode = - IB_WR_RDMA_READ_WITH_INV; - ctxt->wr_op = read_wr.opcode; - read_wr.ex.invalidate_rkey = - ctxt->frmr->mr->lkey; - } else { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - hdr_ctxt->frmr->mr->lkey; - read_wr.next = &inv_wr; - } - } - ctxt->read_hdr = hdr_ctxt; - } - /* Post the read */ - err = svc_rdma_send(xprt, &read_wr); - if (err) { - printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", - err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - goto out; + byte_count = ntohl(ch->rc_target.rs_length); + + while (byte_count > 0) { + ret = reader(xprt, rqstp, head, + &page_no, &page_offset, + ntohl(ch->rc_target.rs_handle), + byte_count, rs_offset, + ((ch+1)->rc_discrim == 0) /* last */ + ); + if (ret < 0) + goto err; + byte_count -= ret; + rs_offset += ret; + head->arg.buflen += ret; } - atomic_inc(&rdma_stat_read); - - if (read_wr.num_sge < chl_map->ch[ch_no].count) { - chl_map->ch[ch_no].count -= read_wr.num_sge; - chl_map->ch[ch_no].start += read_wr.num_sge; - goto next_sge; - } - sgl_offset = 0; - err = 1; } - - out: - svc_rdma_put_req_map(rpl_map); - svc_rdma_put_req_map(chl_map); - + ret = 1; + err: /* Detach arg pages. svc_recv will replenish them */ - for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) - rqstp->rq_pages[ch_no] = NULL; + for (page_no = 0; + &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) + rqstp->rq_pages[page_no] = NULL; - return err; + return ret; } static int rdma_read_complete(struct svc_rqst *rqstp, @@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, dto_q); list_del_init(&ctxt->dto_q); - } - if (ctxt) { spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); return rdma_read_complete(rqstp, ctxt); - } - - if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, dto_q); @@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto close_out; - BUG_ON(ret); goto out; } dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", @@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) } /* Read read-list data. */ - ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { /* read-list posted, defer until data received from client. */ goto defer; - } - if (ret < 0) { + } else if (ret < 0) { /* Post of read-list failed, free context. */ svc_rdma_put_context(ctxt, 1); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7e024a51617e..49fd21a5c215 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,152 +50,6 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* Encode an XDR as an array of IB SGE - * - * Assumptions: - * - head[0] is physically contiguous. - * - tail[0] is physically contiguous. 
- * - pages[] is not physically or virtually contiguous and consists of - * PAGE_SIZE elements. - * - * Output: - * SGE[0] reserved for RCPRDMA header - * SGE[1] data from xdr->head[] - * SGE[2..sge_count-2] data from xdr->pages[] - * SGE[sge_count-1] data from xdr->tail. - * - * The max SGE we need is the length of the XDR / pagesize + one for - * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES - * reserves a page for both the request and the reply header, and this - * array is only concerned with the reply we are assured that we have - * on extra page for the RPCRMDA header. - */ -static int fast_reg_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no = 0; - u8 *frva; - struct svc_rdma_fastreg_mr *frmr; - - frmr = svc_rdma_get_frmr(xprt); - if (IS_ERR(frmr)) - return -ENOMEM; - vec->frmr = frmr; - - /* Skip the RPCRDMA header */ - sge_no = 1; - - /* Map the head. */ - frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - vec->count = 2; - sge_no++; - - /* Map the XDR head */ - frmr->kva = frva; - frmr->direction = DMA_TO_DEVICE; - frmr->access_flags = 0; - frmr->map_len = PAGE_SIZE; - frmr->page_list_len = 1; - page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - virt_to_page(xdr->head[0].iov_base), - page_off, - PAGE_SIZE - page_off, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - - /* Map the XDR page list */ - page_off = xdr->page_base; - page_bytes = xdr->page_len + page_off; - if (!page_bytes) - goto encode_tail; - - /* Map the pages */ - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - vec->sge[sge_no].iov_len = page_bytes; - sge_no++; - while (page_bytes) { - struct page *page; - - page = xdr->pages[page_no++]; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, - page, page_off, - sge_bytes, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - - atomic_inc(&xprt->sc_dma_used); - page_off = 0; /* reset for next time through loop */ - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - vec->count++; - - encode_tail: - /* Map tail */ - if (0 == xdr->tail[0].iov_len) - goto done; - - vec->count++; - vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; - - if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == - ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { - /* - * If head and tail use the same page, we don't need - * to map it again. 
- */ - vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; - } else { - void *va; - - /* Map another page for the tail */ - page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; - va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); - vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; - - frmr->page_list->page_list[page_no] = - ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), - page_off, - PAGE_SIZE, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - frmr->page_list->page_list[page_no])) - goto fatal_err; - atomic_inc(&xprt->sc_dma_used); - frmr->map_len += PAGE_SIZE; - frmr->page_list_len++; - } - - done: - if (svc_rdma_fastreg(xprt, frmr)) - goto fatal_err; - - return 0; - - fatal_err: - printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); - vec->frmr = NULL; - svc_rdma_put_frmr(xprt, frmr); - return -EIO; -} - static int map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec) @@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, BUG_ON(xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); - if (xprt->sc_frmr_pg_list_len) - return fast_reg_xdr(xprt, xdr, vec); - /* Skip the first sge, this is for the RPCRDMA header */ sge_no = 1; @@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, } /* Assumptions: - * - We are using FRMR - * - or - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge_bytes = min_t(size_t, bc, vec->sge[xdr_sge_no].iov_len-sge_off); sge[sge_no].length = sge_bytes; - if (!vec->frmr) { - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; - } else { - sge[sge_no].addr = (unsigned long) - vec->sge[xdr_sge_no].iov_base + sge_off; - sge[sge_no].lkey = vec->frmr->mr->lkey; - } + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; ctxt->count++; - ctxt->frmr = vec->frmr; sge_off = 0; sge_no++; xdr_sge_no++; @@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, return 0; err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, vec->frmr); svc_rdma_put_context(ctxt, 0); /* Fatal error, close transport */ return -EIO; @@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; @@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - if (vec->frmr) - max_write = vec->frmr->map_len; - else - max_write = xprt->sc_max_sge * PAGE_SIZE; + max_write = xprt->sc_max_sge * PAGE_SIZE; /* xdr offset starts at RPC message */ nchunks = 
ntohl(arg_ary->wc_nchunks); @@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; - struct ib_send_wr inv_wr; int sge_no; int sge_bytes; int page_no; @@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma, "svcrdma: could not post a receive buffer, err=%d." "Closing transport %p.\n", ret, rdma); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 0); return -ENOTCONN; } @@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; - ctxt->frmr = vec->frmr; - if (vec->frmr) - set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); - else - clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_dma_lkey; @@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma, int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; - if (!vec->frmr) { - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) - goto err; - atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; - } else { - ctxt->sge[sge_no].addr = (unsigned long) - vec->sge[sge_no].iov_base; - ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; - } + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } BUG_ON(byte_count != 0); @@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[page_no+1].length = 0; } rqstp->rq_next_page = rqstp->rq_respages + 1; + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; @@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma, send_wr.num_sge = sge_no; send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - if (vec->frmr) { - /* Prepare INVALIDATE WR */ - memset(&inv_wr, 0, sizeof inv_wr); - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED; - inv_wr.ex.invalidate_rkey = - vec->frmr->mr->lkey; - send_wr.next = &inv_wr; - } ret = svc_rdma_send(rdma, &send_wr); if (ret) @@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(rdma, vec->frmr); svc_rdma_put_context(ctxt, 1); return -EIO; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 25688fa2207f..e7323fbbd348 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -65,6 +66,7 @@ static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static int svc_rdma_secure_port(struct svc_rqst *); static void rq_cq_reap(struct svcxprt_rdma *xprt); static void sq_cq_reap(struct svcxprt_rdma *xprt); @@ -82,6 +84,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, }; struct svc_xprt_class svc_rdma_class = { @@ -160,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; - map->frmr = NULL; return map; } @@ -336,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt, switch (ctxt->wr_op) { case IB_WR_SEND: - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 1); break; case IB_WR_RDMA_WRITE: + BUG_ON(ctxt->frmr); svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: + svc_rdma_put_frmr(xprt, ctxt->frmr); if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; BUG_ON(!read_hdr); - if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) - svc_rdma_put_frmr(xprt, ctxt->frmr); spin_lock_bh(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); list_add_tail(&read_hdr->dto_q, @@ -363,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt, break; default: + BUG_ON(1); printk(KERN_ERR "svcrdma: unexpected completion type, " "opcode=%d\n", ctxt->wr_op); @@ -378,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt, static void sq_cq_reap(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - struct ib_wc wc; + struct ib_wc wc_a[6]; + struct ib_wc *wc; struct ib_cq *cq = xprt->sc_sq_cq; int ret; + memset(wc_a, 0, sizeof(wc_a)); + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - if (wc.status != IB_WC_SUCCESS) - /* Close the transport */ - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { + int i; - /* Decrement used SQ WR count */ - atomic_dec(&xprt->sc_sq_count); - wake_up(&xprt->sc_send_wait); + for (i = 0; i < ret; i++) { + wc = &wc_a[i]; + if (wc->status != IB_WC_SUCCESS) { + dprintk("svcrdma: sq wc err status %d\n", + wc->status); - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - if (ctxt) - process_context(xprt, ctxt); + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + } - svc_xprt_put(&xprt->sc_xprt); + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *) + (unsigned long)wc->wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } } if (ctxt) @@ -993,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) need_dma_mr = 0; break; case RDMA_TRANSPORT_IB: - if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + 
dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else if (!(devattr.device_cap_flags & + IB_DEVICE_LOCAL_DMA_LKEY)) { need_dma_mr = 1; dma_mr_acc = IB_ACCESS_LOCAL_WRITE; } else @@ -1190,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) container_of(xprt, struct svcxprt_rdma, sc_xprt); /* - * If there are fewer SQ WR available than required to send a - * simple response, return false. - */ - if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) - return 0; - - /* - * ...or there are already waiters on the SQ, + * If there are already waiters on the SQ, * return false. */ if (waitqueue_active(&rdma->sc_send_wait)) @@ -1207,6 +1219,11 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +static int svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + return 1; +} + /* * Attempt to register the kvec representing the RPC memory with the * device. diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1eb9c468d0c9..66f91f0d071a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = { #endif +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void @@ -229,7 +234,6 @@ static void xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; dprintk("RPC: %s: called\n", __func__); @@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_clear_connected(xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", - __func__, rc); + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); @@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args) /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; - xprt->bind_timeout = (60U * HZ); - xprt->reestablish_timeout = (5U * HZ); - xprt->idle_timeout = (5U * 60 * HZ); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ @@ -391,7 +392,7 @@ out4: xprt_rdma_free_addresses(xprt); rc = -EINVAL; out3: - (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: @@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout > (30 * HZ)) - xprt->reestablish_timeout = (30 * HZ); - else if (xprt->reestablish_timeout < (5 * HZ)) - xprt->reestablish_timeout = (5 * HZ); + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) @@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -static int -xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct 
rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int credits = atomic_read(&r_xprt->rx_buf.rb_credits); - - /* == RPC_CWNDSCALE @ init, but *after* setup */ - if (r_xprt->rx_buf.rb_cwndscale == 0UL) { - r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; - dprintk("RPC: %s: cwndscale %lu\n", __func__, - r_xprt->rx_buf.rb_cwndscale); - BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); - } - xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; - return xprt_reserve_xprt_cong(xprt, task); -} - /* * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv @@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) struct rpcrdma_req *req, *nreq; req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); - BUG_ON(NULL == req); + if (req == NULL) + return NULL; if (size > req->rl_size) { dprintk("RPC: %s: size %zd too large for buffer[%zd]: " @@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) * If the allocation or registration fails, the RPC framework * will (doggedly) retry. */ - if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == - RPCRDMA_BOUNCEBUFFERS) { - /* forced to "pure inline" */ - dprintk("RPC: %s: too much data (%zd) for inline " - "(r/w max %d/%d)\n", __func__, size, - rpcx_to_rdmad(xprt).inline_rsize, - rpcx_to_rdmad(xprt).inline_wsize); - size = req->rl_size; - rpc_exit(task, -EIO); /* fail the operation */ - rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; - goto out; - } if (task->tk_flags & RPC_TASK_SWAPPER) nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); else @@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = nreq; } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); -out: req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; @@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer) __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); /* - * Finish the deregistration. When using mw bind, this was - * begun in rpcrdma_reply_handler(). In all other modes, we - * do it here, in thread context. The process is considered + * Finish the deregistration. The process is considered * complete when the rr_func vector becomes NULL - this * was put in place during rpcrdma_reply_handler() - the wait * call below will not block if the dereg is "done". If @@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); - } - - if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { - rep->rr_func = NULL; /* abandon the callback */ - req->rl_reply = NULL; + &req->rl_segments[i], r_xprt); } if (req->rl_iov.length == 0) { /* see allocate above */ @@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; - /* marshal the send itself */ - if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { - r_xprt->rx_stats.failed_marshal_count++; - dprintk("RPC: %s: rpcrdma_marshal_req failed\n", - __func__); - return -EIO; + if (req->rl_niovs == 0) { + rc = rpcrdma_marshal_req(rqst); + if (rc < 0) + goto failed_marshal; } if (req->rl_reply == NULL) /* e.g. 
reconnection */ @@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_bytes_sent = 0; return 0; +failed_marshal: + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", + __func__, rc); + if (rc == -EIO) + return -EIO; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ @@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) */ static struct rpc_xprt_ops xprt_rdma_procs = { - .reserve_xprt = xprt_rdma_reserve_xprt, + .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 93726560eaa8..13dbd1c389ff 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -48,8 +48,8 @@ */ #include <linux/interrupt.h> -#include <linux/pci.h> /* for Tavor hack below */ #include <linux/slab.h> +#include <asm/bitops.h> #include "xprt_rdma.h" @@ -142,98 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } -static inline -void rpcrdma_event_process(struct ib_wc *wc) +static void +rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - struct rpcrdma_mw *frmr; - struct rpcrdma_rep *rep = - (struct rpcrdma_rep *)(unsigned long) wc->wr_id; + struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", - __func__, rep, wc->status, wc->opcode, wc->byte_len); + dprintk("RPC: %s: frmr %p status %X opcode %d\n", + __func__, frmr, wc->status, wc->opcode); - if (!rep) /* send or bind completion that we don't care about */ + if (wc->wr_id == 0ULL) return; - - if (IB_WC_SUCCESS != wc->status) { - dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", - __func__, wc->opcode, wc->status); - rep->rr_len = ~0U; - if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) - rpcrdma_schedule_tasklet(rep); + if (wc->status != IB_WC_SUCCESS) return; - } - switch (wc->opcode) { - case IB_WC_FAST_REG_MR: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + if (wc->opcode == IB_WC_FAST_REG_MR) frmr->r.frmr.state = FRMR_IS_VALID; - break; - case IB_WC_LOCAL_INV: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + else if (wc->opcode == IB_WC_LOCAL_INV) frmr->r.frmr.state = FRMR_IS_INVALID; - break; - case IB_WC_RECV: - rep->rr_len = wc->byte_len; - ib_dma_sync_single_for_cpu( - rdmab_to_ia(rep->rr_buffer)->ri_id->device, - rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); - /* Keep (only) the most recent credits, after check validity */ - if (rep->rr_len >= 16) { - struct rpcrdma_msg *p = - (struct rpcrdma_msg *) rep->rr_base; - unsigned int credits = ntohl(p->rm_credit); - if (credits == 0) { - dprintk("RPC: %s: server" - " dropped credits to 0!\n", __func__); - /* don't deadlock */ - credits = 1; - } else if (credits > rep->rr_buffer->rb_max_requests) { - dprintk("RPC: %s: server" - " over-crediting: %d (%d)\n", - __func__, credits, - rep->rr_buffer->rb_max_requests); - credits = rep->rr_buffer->rb_max_requests; - } - atomic_set(&rep->rr_buffer->rb_credits, credits); - } - /* fall through */ - case IB_WC_BIND_MW: - rpcrdma_schedule_tasklet(rep); - break; - default: - dprintk("RPC: %s: unexpected WC event %X\n", - __func__, wc->opcode); - break; - } } -static inline int -rpcrdma_cq_poll(struct ib_cq *cq) +static int 
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { - struct ib_wc wc; - int rc; + struct ib_wc *wcs; + int budget, count, rc; - for (;;) { - rc = ib_poll_cq(cq, 1, &wc); - if (rc < 0) { - dprintk("RPC: %s: ib_poll_cq failed %i\n", - __func__, rc); + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_send_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) return rc; - } - if (rc == 0) - break; - rpcrdma_event_process(&wc); + count = rc; + while (count-- > 0) + rpcrdma_sendcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE && --budget); + return 0; +} + +/* + * Handle send, fast_reg_mr, and local_inv completions. + * + * Send events are typically suppressed and thus do not result + * in an upcall. Occasionally one is signaled, however. This + * prevents the provider's completion queue from wrapping and + * losing a completion. + */ +static void +rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) +{ + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + + rc = rpcrdma_sendcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); + return; } + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + + rpcrdma_sendcq_poll(cq, ep); +} + +static void +rpcrdma_recvcq_process_wc(struct ib_wc *wc) +{ + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long)wc->wr_id; + + dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (wc->status != IB_WC_SUCCESS) { + rep->rr_len = ~0U; + goto out_schedule; + } + if (wc->opcode != IB_WC_RECV) + return; + + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > rep->rr_buffer->rb_max_requests) + credits = rep->rr_buffer->rb_max_requests; + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + +out_schedule: + rpcrdma_schedule_tasklet(rep); +} + +static int +rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +{ + struct ib_wc *wcs; + int budget, count, rc; + + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_recv_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + return rc; + + count = rc; + while (count-- > 0) + rpcrdma_recvcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE && --budget); return 0; } /* - * rpcrdma_cq_event_upcall + * Handle receive completions. * - * This upcall handles recv, send, bind and unbind events. * It is reentrant but processes single events in order to maintain * ordering of receives to keep server credits. * @@ -242,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq) * connection shutdown. That is, the structures required for * the completion of the reply handler must remain intact until * all memory has been reclaimed. - * - * Note that send events are suppressed and do not result in an upcall. 
*/ static void -rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) +rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; int rc; - rc = rpcrdma_cq_poll(cq); - if (rc) + rc = rpcrdma_recvcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); return; + } - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { - dprintk("RPC: %s: ib_req_notify_cq failed %i\n", + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); return; } - rpcrdma_cq_poll(cq); + rpcrdma_recvcq_poll(cq, ep); } #ifdef RPC_DEBUG @@ -493,54 +539,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; } - switch (memreg) { - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: - if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { - dprintk("RPC: %s: MEMWINDOWS registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; - } - break; - case RPCRDMA_MTHCAFMR: - if (!ia->ri_id->device->alloc_fmr) { -#if RPCRDMA_PERSISTENT_REGISTRATION - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); - memreg = RPCRDMA_ALLPHYSICAL; -#else - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; -#endif - } - break; - case RPCRDMA_FRMR: + if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ if ((devattr.device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { -#if RPCRDMA_PERSISTENT_REGISTRATION dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); + "not supported by HCA\n", __func__); + memreg = RPCRDMA_MTHCAFMR; + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + devattr.max_fast_reg_page_list_len); + } + } + if (memreg == RPCRDMA_MTHCAFMR) { + if (!ia->ri_id->device->alloc_fmr) { + dprintk("RPC: %s: MTHCAFMR registration " + "not supported by HCA\n", __func__); +#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; #else - dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; + rc = -ENOMEM; + goto out2; #endif } - break; } /* @@ -552,8 +576,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * adapter. 
*/ switch (memreg) { - case RPCRDMA_BOUNCEBUFFERS: - case RPCRDMA_REGISTER: case RPCRDMA_FRMR: break; #if RPCRDMA_PERSISTENT_REGISTRATION @@ -563,30 +585,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) IB_ACCESS_REMOTE_READ; goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_MW_BIND; - goto register_setup; case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; +#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: +#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n\t" - "Will continue with degraded performance\n", + "phys register failed with %lX\n", __func__, PTR_ERR(ia->ri_bind_mem)); - memreg = RPCRDMA_REGISTER; - ia->ri_bind_mem = NULL; + rc = -ENOMEM; + goto out2; } break; default: - printk(KERN_ERR "%s: invalid memory registration mode %d\n", - __func__, memreg); - rc = -EINVAL; + printk(KERN_ERR "RPC: Unsupported memory " + "registration mode: %d\n", memreg); + rc = -ENOMEM; goto out2; } dprintk("RPC: %s: memory registration strategy is %d\n", @@ -640,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr devattr; + struct ib_cq *sendcq, *recvcq; int rc, err; rc = ib_query_device(ia->ri_id->device, &devattr); @@ -659,32 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRMR: { + int depth = 7; + /* Add room for frmr register and invalidate WRs. * 1. FRMR reg WR for head * 2. FRMR invalidate WR for head - * 3. FRMR reg WR for pagelist - * 4. FRMR invalidate WR for pagelist + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist * 5. FRMR reg WR for tail * 6. FRMR invalidate WR for tail * 7. The RDMA_SEND WR */ - ep->rep_attr.cap.max_send_wr *= 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + int delta = RPCRDMA_MAX_DATA_SEGS - + ia->ri_max_frmr_depth; + + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + + } + ep->rep_attr.cap.max_send_wr *= depth; if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { - cdata->max_requests = devattr.max_qp_wr / 7; + cdata->max_requests = devattr.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; } break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Add room for mw_binds+unbinds - overkill! 
*/ - ep->rep_attr.cap.max_send_wr++; - ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) - return -EINVAL; - break; + } default: break; } @@ -705,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - ep->rep_cqinit -= RPCRDMA_MAX_SEGS; - break; - default: - break; - } + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); ep->rep_ia = ia; init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - /* - * Create a single cq for receive dto and mw_bind (only ever - * care about unbind, really). Send completions are suppressed. - * Use single threaded tasklet upcalls to maintain ordering. - */ - ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, - rpcrdma_cq_async_error_upcall, NULL, - ep->rep_attr.cap.max_recv_wr + + sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, ep, ep->rep_attr.cap.max_send_wr + 1, 0); - if (IS_ERR(ep->rep_cq)) { - rc = PTR_ERR(ep->rep_cq); - dprintk("RPC: %s: ib_create_cq failed: %i\n", + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", __func__, rc); goto out1; } - rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); + rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + + recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_recv_wr + 1, 0); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); if (rc) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); + ib_destroy_cq(recvcq); goto out2; } - ep->rep_attr.send_cq = ep->rep_cq; - ep->rep_attr.recv_cq = ep->rep_cq; + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; /* Initialize cma parameters */ @@ -754,9 +788,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) - ep->rep_remote_cma.responder_resources = 0; - else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; @@ -768,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return 0; out2: - err = ib_destroy_cq(ep->rep_cq); + err = ib_destroy_cq(sendcq); if (err) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); @@ -782,11 +814,8 @@ out1: * Disconnect and destroy endpoint. After this, the only * valid operations on the ep are to free it (if dynamically * allocated) or re-create it. - * - * The caller's error handling must be sure to not leak the endpoint - * if this function fails. 
*/ -int +void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; @@ -794,6 +823,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); + cancel_delayed_work_sync(&ep->rep_connect_worker); + if (ia->ri_id->qp) { rc = rpcrdma_ep_disconnect(ep, ia); if (rc) @@ -809,13 +840,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ep->rep_pad_mr = NULL; } - rpcrdma_clean_cq(ep->rep_cq); - rc = ib_destroy_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); - return rc; + rpcrdma_clean_cq(ep->rep_attr.send_cq); + rc = ib_destroy_cq(ep->rep_attr.send_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); } /* @@ -831,17 +866,20 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: + dprintk("RPC: %s: reconnecting...\n", __func__); rc = rpcrdma_ep_disconnect(ep, ia); if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_cq); + + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { - rc = PTR_ERR(id); + rc = -EHOSTUNREACH; goto out; } /* TEMP TEMP TEMP - fail if new device: @@ -855,35 +893,32 @@ retry: printk("RPC: %s: can't reconnect on " "different device!\n", __func__); rdma_destroy_id(id); - rc = -ENETDOWN; + rc = -ENETUNREACH; goto out; } /* END TEMP */ + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rdma_destroy_id(id); + rc = -ENETUNREACH; + goto out; + } rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; + } else { + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + /* do not update ep->rep_connected */ + return -ENETUNREACH; + } } - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - goto out; - } - -/* XXX Tavor device performs badly with 2K MTU! 
*/ -if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { - struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); - if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && - (pcid->vendor == PCI_VENDOR_ID_MELLANOX || - pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { - struct ib_qp_attr attr = { - .path_mtu = IB_MTU_1024 - }; - rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); - } -} - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -944,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -967,7 +1003,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { char *p; - size_t len; + size_t len, rlen, wlen; int i, rc; struct rpcrdma_mw *r; @@ -997,11 +1033,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * sizeof(struct rpcrdma_mw); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; default: break; } @@ -1032,32 +1063,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, } p += cdata->padding; - /* - * Allocate the fmr's, or mw's for mw_bind chunk registration. - * We "cycle" the mw's in order to minimize rkey reuse, - * and also reduce unbind-to-bind collision. - */ INIT_LIST_HEAD(&buf->rb_mws); r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - RPCRDMA_MAX_SEGS); + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_mr)) { rc = PTR_ERR(r->r.frmr.fr_mr); dprintk("RPC: %s: ib_alloc_fast_reg_mr" " failed %i\n", __func__, rc); goto out; } - r->r.frmr.fr_pgl = - ib_alloc_fast_reg_page_list(ia->ri_id->device, - RPCRDMA_MAX_SEGS); + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_pgl)) { rc = PTR_ERR(r->r.frmr.fr_pgl); dprintk("RPC: %s: " "ib_alloc_fast_reg_page_list " "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); goto out; } list_add(&r->mw_list, &buf->rb_mws); @@ -1082,21 +1110,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, ++r; } break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Allocate one extra request's worth, for full cycling */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); - if (IS_ERR(r->r.mw)) { - rc = PTR_ERR(r->r.mw); - dprintk("RPC: %s: ib_alloc_mw" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } - break; default: break; } @@ -1105,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * Allocate/init the request/reply buffers. Doing this * using kmalloc for now -- one for each buf. 
*/ + wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); + rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); + dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", + __func__, wlen, rlen); + for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; struct rpcrdma_rep *rep; - len = cdata->inline_wsize + sizeof(struct rpcrdma_req); - /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ - /* Typical ~2400b, so rounding up saves work later */ - if (len < 4096) - len = 4096; - req = kmalloc(len, GFP_KERNEL); + req = kmalloc(wlen, GFP_KERNEL); if (req == NULL) { dprintk("RPC: %s: request buffer %d alloc" " failed\n", __func__, i); @@ -1126,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, buf->rb_send_bufs[i]->rl_buffer = buf; rc = rpcrdma_register_internal(ia, req->rl_base, - len - offsetof(struct rpcrdma_req, rl_base), + wlen - offsetof(struct rpcrdma_req, rl_base), &buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); if (rc) goto out; - buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); + buf->rb_send_bufs[i]->rl_size = wlen - + sizeof(struct rpcrdma_req); - len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); - rep = kmalloc(len, GFP_KERNEL); + rep = kmalloc(rlen, GFP_KERNEL); if (rep == NULL) { dprintk("RPC: %s: reply buffer %d alloc failed\n", __func__, i); @@ -1145,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, memset(rep, 0, sizeof(struct rpcrdma_rep)); buf->rb_recv_bufs[i] = rep; buf->rb_recv_bufs[i]->rr_buffer = buf; - init_waitqueue_head(&rep->rr_unbind); rc = rpcrdma_register_internal(ia, rep->rr_base, - len - offsetof(struct rpcrdma_rep, rr_base), + rlen - offsetof(struct rpcrdma_rep, rr_base), &buf->rb_recv_bufs[i]->rr_handle, &buf->rb_recv_bufs[i]->rr_iov); if (rc) @@ -1179,7 +1191,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) - * 1a. bind mw memory * 2. send mr memory (mr free, then kfree) * 3. padding (if any) [moved to rpcrdma_ep_destroy] * 4. 
arrays @@ -1194,41 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_recv_bufs[i]); } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = ib_dealloc_mw(r->r.mw); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_mw" - " failed %i\n", - __func__, rc); - break; - default: - break; - } - } rpcrdma_deregister_internal(ia, buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); @@ -1236,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; + case RPCRDMA_MTHCAFMR: + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_fmr" + " failed %i\n", + __func__, rc); + break; + default: + break; + } + } + kfree(buf->rb_pool); } @@ -1299,21 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) int i; unsigned long flags; - BUG_ON(req->rl_nchunks != 0); spin_lock_irqsave(&buffers->rb_lock, flags); buffers->rb_send_bufs[--buffers->rb_send_index] = req; req->rl_niovs = 0; if (req->rl_reply) { buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - init_waitqueue_head(&req->rl_reply->rr_unbind); req->rl_reply->rr_func = NULL; req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: /* * Cycle mw's back in reverse order, and "spin" them. * This delays and scrambles reuse as much as possible. @@ -1358,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) /* * Put reply buffers back into pool when not attached to - * request. This happens in error conditions, and when - * aborting unbinds. Pre-decrement counter/array index. + * request. This happens in error conditions. 
*/ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) @@ -1498,8 +1496,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, seg1->mr_offset -= pageoff; /* start of page */ seg1->mr_len += pageoff; len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; + if (*nsegs > ia->ri_max_frmr_depth) + *nsegs = ia->ri_max_frmr_depth; for (page_no = i = 0; i < *nsegs;) { rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; @@ -1536,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, } else post_wr = &frmr_wr; - /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); - /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; @@ -1550,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.page_list_len = page_no; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - BUG_ON(frmr_wr.wr.fast_reg.length < len); + if (frmr_wr.wr.fast_reg.length < len) { + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + return -EIO; + } + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); @@ -1661,135 +1664,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, return rc; } -static int -rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct ib_mw_bind param; - int rc; - - *nsegs = 1; - rpcrdma_map_one(ia, seg, writing); - param.bind_info.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.bind_info.addr = seg->mr_dma; - param.bind_info.length = seg->mr_len; - param.send_flags = 0; - param.bind_info.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.bind_info.addr; - seg->mr_nsegs = 1; - } - return rc; -} - -static int -rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt, void **r) -{ - struct ib_mw_bind param; - LIST_HEAD(l); - int rc; - - BUG_ON(seg->mr_nsegs != 1); - param.bind_info.mr = ia->ri_bind_mem; - param.bind_info.addr = 0ULL; /* unbind */ - param.bind_info.length = 0; - param.bind_info.mw_access_flags = 0; - if (*r) { - param.wr_id = (u64) (unsigned long) *r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - *r = NULL; /* will upcall on completion */ - return rc; -} - -static int -rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len, i, rc = 0; - - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (len = 0, i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, i, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... 
status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - int rc; - - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); - return rc; -} - int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) @@ -1819,16 +1693,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; - /* Registration using memory windows */ - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Default registration each time */ default: - rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); - break; + return -1; } if (rc) return -1; @@ -1838,7 +1704,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt, void *r) + struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; int nsegs = seg->mr_nsegs, rc; @@ -1847,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, #if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: - BUG_ON(nsegs != 1); rpcrdma_unmap_one(ia, seg); - rc = 0; break; #endif @@ -1861,21 +1725,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_deregister_fmr_external(seg, ia); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); - break; - default: - rc = rpcrdma_deregister_default_external(seg, ia); break; } - if (r) { - struct rpcrdma_rep *rep = r; - void (*func)(struct rpcrdma_rep *) = rep->rr_func; - rep->rr_func = NULL; - func(rep); /* dereg done, callback now */ - } return nsegs; } @@ -1950,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, ib_dma_sync_single_for_cpu(ia->ri_id->device, rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); - DECR_CQCOUNT(ep); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); if (rc) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445dc1d1a..89e7cd479705 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -43,6 +43,7 @@ #include <linux/wait.h> /* wait_queue_head_t, etc */ #include <linux/spinlock.h> /* spinlock_t, etc */ #include <linux/atomic.h> /* atomic_t, etc */ +#include <linux/workqueue.h> /* struct work_struct */ #include <rdma/rdma_cm.h> /* RDMA connection api */ #include <rdma/ib_verbs.h> /* RDMA verbs api */ @@ -66,18 +67,21 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; + unsigned int ri_max_frmr_depth; }; /* * RDMA Endpoint -- one per transport instance */ +#define RPCRDMA_WC_BUDGET (128) +#define RPCRDMA_POLLSIZE (16) + struct rpcrdma_ep { atomic_t rep_cqcount; int rep_cqinit; int rep_connected; struct rpcrdma_ia *rep_ia; - struct ib_cq *rep_cq; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct ib_sge rep_pad; /* holds zeroed 
pad */ @@ -86,6 +90,9 @@ struct rpcrdma_ep { struct rpc_xprt *rep_xprt; /* for rep_func */ struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; + struct delayed_work rep_connect_worker; + struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; + struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; }; #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) @@ -124,7 +131,6 @@ struct rpcrdma_rep { struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ struct list_head rr_list; /* tasklet list */ - wait_queue_head_t rr_unbind; /* optional unbind wait */ struct ib_sge rr_iov; /* for posting */ struct ib_mr *rr_handle; /* handle for mem in rr_iov */ char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ @@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ struct ib_mr *rl_mr; /* if registered directly */ struct rpcrdma_mw { /* if registered from region */ union { - struct ib_mw *mw; struct ib_fmr *fmr; struct { struct ib_fast_reg_page_list *fr_pgl; @@ -207,7 +212,6 @@ struct rpcrdma_req { struct rpcrdma_buffer { spinlock_t rb_lock; /* protects indexes */ atomic_t rb_credits; /* most recent server credits */ - unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; @@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); */ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); -int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); @@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *, int rpcrdma_register_external(struct rpcrdma_mr_seg *, int, int, struct rpcrdma_xprt *); int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *, void *); + struct rpcrdma_xprt *); /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ +void rpcrdma_connect_worker(struct work_struct *); void rpcrdma_conn_func(struct rpcrdma_ep *); void rpcrdma_reply_handler(struct rpcrdma_rep *); diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh new file mode 100755 index 000000000000..515c4c00e957 --- /dev/null +++ b/scripts/decode_stacktrace.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# (c) 2014, Sasha Levin <sasha.levin@oracle.com> +#set -x + +if [[ $# != 2 ]]; then + echo "Usage:" + echo " $0 [vmlinux] [base path]" + exit 1 +fi + +vmlinux=$1 +basepath=$2 +declare -A cache + +parse_symbol() { + # The structure of symbol at this point is: + # [name]+[offset]/[total length] + # + # For example: + # do_basic_setup+0x9c/0xbf + + + # Strip the symbol name so that we could look it up + local name=${symbol%+*} + + # Use 'nm vmlinux' to figure out the base address of said symbol. + # It's actually faster to call it every time than to load it + # all into bash. + if [[ "${cache[$name]+isset}" == "isset" ]]; then + local base_addr=${cache[$name]} + else + local base_addr=$(nm "$vmlinux" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1) + cache["$name"]="$base_addr" + fi + # Let's start doing the math to get the exact address into the + # symbol. First, strip out the symbol total length. 
+ local expr=${symbol%/*} + + # Now, replace the symbol name with the base address we found + # before. + expr=${expr/$name/0x$base_addr} + + # Evaluate it to find the actual address + expr=$((expr)) + local address=$(printf "%x\n" "$expr") + + # Pass it to addr2line to get filename and line number + # Could get more than one result + if [[ "${cache[$address]+isset}" == "isset" ]]; then + local code=${cache[$address]} + else + local code=$(addr2line -i -e "$vmlinux" "$address") + cache[$address]=$code + fi + + # addr2line doesn't return a proper error code if it fails, so + # we detect it using the value it prints so that we could preserve + # the offset/size into the function and bail out + if [[ $code == "??:0" ]]; then + return + fi + + # Strip out the base of the path + code=${code//$basepath/""} + + # In the case of inlines, move everything to same line + code=${code//$'\n'/' '} + + # Replace old address with pretty line numbers + symbol="$name ($code)" +} + +decode_code() { + local scripts=`dirname "${BASH_SOURCE[0]}"` + + echo "$1" | $scripts/decodecode +} + +handle_line() { + local words + + # Tokenize + read -a words <<<"$1" + + # Remove hex numbers. Do it ourselves until it happens in the + # kernel + + # We need to know the index of the last element before we + # remove elements because arrays are sparse + local last=$(( ${#words[@]} - 1 )) + + for i in "${!words[@]}"; do + # Remove the address + if [[ ${words[$i]} =~ \[\<([^]]+)\>\] ]]; then + unset words[$i] + fi + + # Format timestamps with tabs + if [[ ${words[$i]} == \[ && ${words[$i+1]} == *\] ]]; then + unset words[$i] + words[$i+1]=$(printf "[%13s\n" "${words[$i+1]}") + fi + done + + # The symbol is the last element, process it + symbol=${words[$last]} + unset words[$last] + parse_symbol # modifies $symbol + + # Add up the line number to the symbol + echo "${words[@]}" "$symbol" +} + +while read line; do + # Let's see if we have an address in the line + if [[ $line =~ \[\<([^]]+)\>\] ]]; then + # Translate address to line numbers + handle_line "$line" + # Is it a code line? + elif [[ $line == *Code:* ]]; then + decode_code "$line" + else + # Nothing special in this line, show it as is + echo "$line" + fi +done |