418 files changed, 16954 insertions, 5571 deletions
diff --git a/Documentation/core-api/packing.rst b/Documentation/core-api/packing.rst index 821691f23c54..0ce2078c8e13 100644 --- a/Documentation/core-api/packing.rst +++ b/Documentation/core-api/packing.rst @@ -227,11 +227,119 @@ Intended use Drivers that opt to use this API first need to identify which of the above 3 quirk combinations (for a total of 8) match what the hardware documentation -describes. Then they should wrap the packing() function, creating a new -xxx_packing() that calls it using the proper QUIRK_* one-hot bits set. +describes. + +There are 3 supported usage patterns, detailed below. + +packing() +^^^^^^^^^ + +This API function is deprecated. The packing() function returns an int-encoded error code, which protects the programmer against incorrect API use. The errors are not expected to occur -during runtime, therefore it is reasonable for xxx_packing() to return void -and simply swallow those errors. Optionally it can dump stack or print the -error description. +during runtime, therefore it is reasonable to wrap packing() into a custom +function which returns void and swallows those errors. Optionally it can +dump stack or print the error description. + +.. code-block:: c + + void my_packing(void *buf, u64 *val, int startbit, int endbit, + size_t len, enum packing_op op) + { + int err; + + /* Adjust quirks accordingly */ + err = packing(buf, val, startbit, endbit, len, op, QUIRK_LSW32_IS_FIRST); + if (likely(!err)) + return; + + if (err == -EINVAL) { + pr_err("Start bit (%d) expected to be larger than end (%d)\n", + startbit, endbit); + } else if (err == -ERANGE) { + if ((startbit - endbit + 1) > 64) + pr_err("Field %d-%d too large for 64 bits!\n", + startbit, endbit); + else + pr_err("Cannot store %llx inside bits %d-%d (would truncate)\n", + *val, startbit, endbit); + } + dump_stack(); + } + +pack() and unpack() +^^^^^^^^^^^^^^^^^^^ + +These are const-correct variants of packing(), and eliminate the last "enum +packing_op op" argument. + +Calling pack(...) is equivalent, and preferred, to calling packing(..., PACK). + +Calling unpack(...) is equivalent, and preferred, to calling packing(..., UNPACK). + +pack_fields() and unpack_fields() +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The library exposes optimized functions for the scenario where there are many +fields represented in a buffer, and it encourages consumer drivers to avoid +repetitive calls to pack() and unpack() for each field, but instead use +pack_fields() and unpack_fields(), which reduces the code footprint. + +These APIs use field definitions in arrays of ``struct packed_field_u8`` or +``struct packed_field_u16``, allowing consumer drivers to minimize the size +of these arrays according to their custom requirements. + +The pack_fields() and unpack_fields() API functions are actually macros which +automatically select the appropriate function at compile time, based on the +type of the fields array passed in. + +An additional benefit over pack() and unpack() is that sanity checks on the +field definitions are handled at compile time with ``BUILD_BUG_ON`` rather +than only when the offending code is executed. These functions return void and +wrapping them to handle unexpected errors is not necessary. + +It is recommended, but not required, that you wrap your packed buffer into a +structured type with a fixed size. This generally makes it easier for the +compiler to enforce that the correct size buffer is used. + +Here is an example of how to use the fields APIs: + +.. 
code-block:: c + + /* Ordering inside the unpacked structure is flexible and can be different + * from the packed buffer. Here, it is optimized to reduce padding. + */ + struct data { + u64 field3; + u32 field4; + u16 field1; + u8 field2; + }; + + #define SIZE 13 + + typedef struct __packed { u8 buf[SIZE]; } packed_buf_t; + + static const struct packed_field_u8 fields[] = { + PACKED_FIELD(100, 90, struct data, field1), + PACKED_FIELD(89, 87, struct data, field2), + PACKED_FIELD(86, 30, struct data, field3), + PACKED_FIELD(29, 0, struct data, field4), + }; + + void unpack_your_data(const packed_buf_t *buf, struct data *unpacked) + { + BUILD_BUG_ON(sizeof(*buf) != SIZE); + + unpack_fields(buf, sizeof(*buf), unpacked, fields, + QUIRK_LITTLE_ENDIAN); + } + + void pack_your_data(const struct data *unpacked, packed_buf_t *buf) + { + BUILD_BUG_ON(sizeof(*buf) != SIZE); + + pack_fields(buf, sizeof(*buf), unpacked, fields, + QUIRK_LITTLE_ENDIAN); + } diff --git a/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml b/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml new file mode 100644 index 000000000000..2b8b74c5feec --- /dev/null +++ b/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright 2021-2024 NXP +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/nxp,s32-dwmac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP S32G2xx/S32G3xx/S32R45 GMAC ethernet controller + +maintainers: + - Jan Petrous (OSS) <jan.petrous@oss.nxp.com> + +description: + This device is a Synopsys DWC IP, integrated on NXP S32G/R SoCs. + The SoC series S32G2xx and S32G3xx feature one DWMAC instance, + while the SoC S32R45 has two instances. The devices can use RGMII/RMII/MII + interface over Pinctrl device or the output can be routed + to the embedded SerDes for SGMII connectivity. 
+ +properties: + compatible: + oneOf: + - const: nxp,s32g2-dwmac + - items: + - enum: + - nxp,s32g3-dwmac + - nxp,s32r45-dwmac + - const: nxp,s32g2-dwmac + + reg: + items: + - description: Main GMAC registers + - description: GMAC PHY mode control register + + interrupts: + maxItems: 1 + + interrupt-names: + const: macirq + + clocks: + items: + - description: Main GMAC clock + - description: Transmit clock + - description: Receive clock + - description: PTP reference clock + + clock-names: + items: + - const: stmmaceth + - const: tx + - const: rx + - const: ptp_ref + +required: + - clocks + - clock-names + +allOf: + - $ref: snps,dwmac.yaml# + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + #include <dt-bindings/interrupt-controller/irq.h> + #include <dt-bindings/phy/phy.h> + bus { + #address-cells = <2>; + #size-cells = <2>; + + ethernet@4033c000 { + compatible = "nxp,s32g2-dwmac"; + reg = <0x0 0x4033c000 0x0 0x2000>, /* gmac IP */ + <0x0 0x4007c004 0x0 0x4>; /* GMAC_0_CTRL_STS */ + interrupt-parent = <&gic>; + interrupts = <GIC_SPI 57 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + snps,mtl-rx-config = <&mtl_rx_setup>; + snps,mtl-tx-config = <&mtl_tx_setup>; + clocks = <&clks 24>, <&clks 17>, <&clks 16>, <&clks 15>; + clock-names = "stmmaceth", "tx", "rx", "ptp_ref"; + phy-mode = "rgmii-id"; + phy-handle = <&phy0>; + + mtl_rx_setup: rx-queues-config { + snps,rx-queues-to-use = <5>; + }; + + mtl_tx_setup: tx-queues-config { + snps,tx-queues-to-use = <5>; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@0 { + reg = <0>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index eb1f3ae41ab9..91e75eb3f329 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -67,6 +67,7 @@ properties: - ingenic,x2000-mac - loongson,ls2k-dwmac - loongson,ls7a-dwmac + - nxp,s32g2-dwmac - qcom,qcs404-ethqos - qcom,sa8775p-ethqos - qcom,sc8280xp-ethqos diff --git a/Documentation/devicetree/bindings/net/ti,dp83822.yaml b/Documentation/devicetree/bindings/net/ti,dp83822.yaml index 784866ea392b..50c24248df26 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83822.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83822.yaml @@ -96,6 +96,32 @@ properties: - master - slave + ti,gpio2-clk-out: + description: | + DP83822 PHY only. + The GPIO2 pin on the DP83822 can be configured as clock output. When + omitted, the PHY's default will be left as is. + + - 'mac-if': In MII mode the clock frequency is 25-MHz, in RMII Mode the + clock frequency is 50-MHz and in RGMII Mode the clock frequency is + 25-MHz. + - 'xi': XI clock(pass-through clock from XI pin). + - 'int-ref': Internal reference clock 25-MHz. + - 'rmii-master-mode-ref': RMII master mode reference clock 50-MHz. RMII + master mode reference clock is identical to MAC IF clock in RMII master + mode. + - 'free-running': Free running clock 125-MHz. + - 'recovered': Recovered clock is a 125-MHz recovered clock from a + connected link partner. 
+ $ref: /schemas/types.yaml#/definitions/string + enum: + - mac-if + - xi + - int-ref + - rmii-master-mode-ref + - free-running + - recovered + required: - reg @@ -110,6 +136,7 @@ examples: reg = <0>; rx-internal-delay-ps = <1>; tx-internal-delay-ps = <1>; + ti,gpio2-clk-out = "xi"; }; }; diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 4f803eaac6d8..9660ffb1ed6a 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -106,6 +106,9 @@ properties: name-prefix: description: For enum the prefix of the values, optional. type: string + enum-cnt-name: + description: Name of the render-max counter enum entry. + type: string # End genetlink-c attribute-sets: diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 8db0e22fa72c..16380e12cabe 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -117,6 +117,9 @@ properties: name-prefix: description: For enum the prefix of the values, optional. type: string + enum-cnt-name: + description: Name of the render-max counter enum entry. + type: string # End genetlink-c # Start genetlink-legacy members: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index 914aa1c0a273..1b0772c8e333 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -221,7 +221,7 @@ properties: type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, - u8, u16, u32, u64, s8, s16, s32, s64, + uint, sint, u8, u16, u32, u64, s8, s16, s32, s64, string, nest, indexed-array, nest-type-value, sub-message ] doc: diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 93369f0eb816..60f85fbf4156 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -5,6 +5,7 @@ name: ethtool protocol: genetlink-legacy doc: Partial family for Ethtool Netlink. +uapi-header: linux/ethtool_netlink_generated.h definitions: - @@ -12,44 +13,100 @@ definitions: enum-name: type: enum entries: [ vxlan, geneve, vxlan-gpe ] + enum-cnt-name: __ethtool-udp-tunnel-type-cnt + render-max: true - name: stringset type: enum entries: [] + header: linux/ethtool.h # skip rendering, no actual definition - name: header-flags type: flags - entries: [ compact-bitsets, omit-reply, stats ] + name-prefix: ethtool-flag- + doc: common ethtool header flags + entries: + - + name: compact-bitsets + doc: use compact bitsets in reply + - + name: omit-reply + doc: provide optional reply for SET or ACT requests + - + name: stats + doc: request statistics, if supported by the driver - name: module-fw-flash-status type: enum - entries: [ started, in_progress, completed, error ] + doc: plug-in module firmware flashing status + header: linux/ethtool.h + entries: + - + name: started + doc: The firmware flashing process has started. + - + name: in_progress + doc: The firmware flashing process is in progress. + - + name: completed + doc: The firmware flashing process was completed successfully. + - + name: error + doc: The firmware flashing process was stopped due to an error. - name: c33-pse-ext-state - enum-name: + doc: "groups of PSE extended states functions. 
IEEE 802.3-2022 33.2.4.4 Variables" type: enum name-prefix: ethtool-c33-pse-ext-state- + header: linux/ethtool.h entries: - - none - - error-condition - - mr-mps-valid - - mr-pse-enable - - option-detect-ted - - option-vport-lim - - ovld-detected - - power-not-available - - short-detected + - + name: none + doc: none + - + name: error-condition + doc: Group of error_condition states + - + name: mr-mps-valid + doc: Group of mr_mps_valid states + - + name: mr-pse-enable + doc: Group of mr_pse_enable states + - + name: option-detect-ted + doc: Group of option_detect_ted states + - + name: option-vport-lim + doc: Group of option_vport_lim states + - + name: ovld-detected + doc: Group of ovld_detected states + - + name: power-not-available + doc: Group of power_not_available states + - + name: short-detected + doc: Group of short_detected states - name: phy-upstream-type enum-name: type: enum entries: [ mac, phy ] + - + name: tcp-data-split + type: enum + entries: [ unknown, disabled, enabled ] attribute-sets: - name: header + attr-cnt-name: __ethtool-a-header-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: dev-index type: u32 - @@ -65,8 +122,13 @@ attribute-sets: - name: bitset-bit + attr-cnt-name: __ethtool-a-bitset-bit-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: index type: u32 - @@ -77,16 +139,26 @@ type: flag - name: bitset-bits + attr-cnt-name: __ethtool-a-bitset-bits-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: bit type: nest multi-attr: true nested-attributes: bitset-bit - name: bitset + attr-cnt-name: __ethtool-a-bitset-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: nomask type: flag - @@ -104,8 +176,13 @@ type: binary - name: string + attr-cnt-name: __ethtool-a-string-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: index type: u32 - @@ -113,16 +190,30 @@ type: string - name: strings + attr-cnt-name: __ethtool-a-strings-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: string type: nest multi-attr: true nested-attributes: string - name: stringset + attr-cnt-name: __ethtool-a-stringset-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: id type: u32 - @@ -135,16 +226,26 @@ nested-attributes: strings - name: stringsets + attr-cnt-name: __ethtool-a-stringsets-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: stringset type: nest multi-attr: true nested-attributes: stringset - name: strset + attr-cnt-name: __ethtool-a-strset-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -158,8 +259,13 @@ - name: privflags + attr-cnt-name: __ethtool-a-privflags-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -170,8 +276,13 @@ - name: rings + attr-cnt-name: __ethtool-a-rings-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -205,6 +316,7 @@ - name: tcp-data-split type: u8 + enum: tcp-data-split - name: cqe-size type: u32 @@ -223,32 +335,49 @@ - name: mm-stat + attr-cnt-name: __ethtool-a-mm-stat-cnt + doc: MAC Merge (802.3) attributes: - + name: unspec + type: unused + value: 0 + - name: pad type: pad - name: reassembly-errors + doc: aMACMergeFrameAssErrorCount type: u64 - 
name: smd-errors + doc: aMACMergeFrameSmdErrorCount type: u64 - name: reassembly-ok + doc: aMACMergeFrameAssOkCount type: u64 - name: rx-frag-count + doc: aMACMergeFragCountRx type: u64 - name: tx-frag-count + doc: aMACMergeFragCountTx type: u64 - name: hold-count + doc: aMACMergeHoldCount type: u64 - name: mm + attr-cnt-name: __ethtool-a-mm-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -285,8 +414,13 @@ attribute-sets: nested-attributes: mm-stat - name: linkinfo + attr-cnt-name: __ethtool-a-linkinfo-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -307,8 +441,13 @@ attribute-sets: type: u8 - name: linkmodes + attr-cnt-name: __ethtool-a-linkmodes-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -343,8 +482,13 @@ attribute-sets: type: u8 - name: linkstate + attr-cnt-name: __ethtool-a-linkstate-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -368,8 +512,13 @@ attribute-sets: type: u32 - name: debug + attr-cnt-name: __ethtool-a-debug-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -379,8 +528,13 @@ attribute-sets: nested-attributes: bitset - name: wol + attr-cnt-name: __ethtool-a-wol-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -393,8 +547,13 @@ attribute-sets: type: binary - name: features + attr-cnt-name: __ethtool-a-features-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -416,8 +575,13 @@ attribute-sets: nested-attributes: bitset - name: channels + attr-cnt-name: __ethtool-a-channels-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -448,8 +612,13 @@ attribute-sets: - name: irq-moderation + attr-cnt-name: __ethtool-a-irq-moderation-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: usec type: u32 - @@ -460,16 +629,26 @@ attribute-sets: type: u32 - name: profile + attr-cnt-name: __ethtool-a-profile-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: irq-moderation type: nest multi-attr: true nested-attributes: irq-moderation - name: coalesce + attr-cnt-name: __ethtool-a-coalesce-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -565,8 +744,13 @@ attribute-sets: - name: pause-stat + attr-cnt-name: __ethtool-a-pause-stat-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pad type: pad - @@ -577,8 +761,13 @@ attribute-sets: type: u64 - name: pause + attr-cnt-name: __ethtool-a-pause-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -600,8 +789,13 @@ attribute-sets: type: u32 - name: eee + attr-cnt-name: __ethtool-a-eee-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -627,8 +821,13 @@ attribute-sets: type: u32 - name: ts-stat + attr-cnt-name: __ethtool-a-ts-stat-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: tx-pkts type: uint - @@ -638,9 +837,28 @@ attribute-sets: name: tx-err type: uint - + name: ts-hwtstamp-provider + attr-cnt-name: __ethtool-a-ts-hwtstamp-provider-cnt + attributes: + - 
+ name: unspec + type: unused + value: 0 + - + name: index + type: u32 + - + name: qualifier + type: u32 + - name: tsinfo + attr-cnt-name: __ethtool-a-tsinfo-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -663,22 +881,39 @@ attribute-sets: name: stats type: nest nested-attributes: ts-stat + - + name: hwtstamp-provider + type: nest + nested-attributes: ts-hwtstamp-provider - name: cable-result + attr-cnt-name: __ethtool-a-cable-result-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pair + doc: ETHTOOL_A_CABLE_PAIR type: u8 - name: code + doc: ETHTOOL_A_CABLE_RESULT_CODE type: u8 - name: src + doc: ETHTOOL_A_CABLE_INF_SRC type: u32 - name: cable-fault-length + attr-cnt-name: __ethtool-a-cable-fault-length-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pair type: u8 - @@ -689,8 +924,13 @@ attribute-sets: type: u32 - name: cable-nest + attr-cnt-name: __ethtool-a-cable-nest-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: result type: nest nested-attributes: cable-result @@ -700,20 +940,31 @@ attribute-sets: nested-attributes: cable-fault-length - name: cable-test + attr-cnt-name: __ethtool-a-cable-test-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header - name: cable-test-ntf + attr-cnt-name: __ethtool-a-cable-test-ntf-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header - name: status + doc: _STARTED/_COMPLETE type: u8 - name: nest @@ -721,8 +972,13 @@ attribute-sets: nested-attributes: cable-nest - name: cable-test-tdr-cfg + attr-cnt-name: __ethtool-a-cable-test-tdr-cfg-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: first type: u32 - @@ -736,8 +992,13 @@ attribute-sets: type: u8 - name: cable-test-tdr-ntf + attr-cnt-name: __ethtool-a-cable-test-tdr-ntf-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -750,8 +1011,13 @@ attribute-sets: nested-attributes: cable-nest - name: cable-test-tdr + attr-cnt-name: __ethtool-a-cable-test-tdr-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -761,8 +1027,13 @@ attribute-sets: nested-attributes: cable-test-tdr-cfg - name: tunnel-udp-entry + attr-cnt-name: __ethtool-a-tunnel-udp-entry-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: port type: u16 byte-order: big-endian @@ -772,8 +1043,13 @@ attribute-sets: enum: udp-tunnel-type - name: tunnel-udp-table + attr-cnt-name: __ethtool-a-tunnel-udp-table-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: size type: u32 - @@ -787,15 +1063,25 @@ attribute-sets: nested-attributes: tunnel-udp-entry - name: tunnel-udp + attr-cnt-name: __ethtool-a-tunnel-udp-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: table type: nest nested-attributes: tunnel-udp-table - name: tunnel-info + attr-cnt-name: __ethtool-a-tunnel-info-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -805,8 +1091,13 @@ attribute-sets: nested-attributes: tunnel-udp - name: fec-stat + attr-cnt-name: __ethtool-a-fec-stat-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pad type: pad - @@ -823,8 +1114,13 @@ attribute-sets: sub-type: u64 - name: fec + attr-cnt-name: __ethtool-a-fec-cnt attributes: - + name: unspec 
+ type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -844,8 +1140,13 @@ attribute-sets: nested-attributes: fec-stat - name: module-eeprom + attr-cnt-name: __ethtool-a-module-eeprom-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -869,8 +1170,13 @@ attribute-sets: type: binary - name: stats-grp + attr-cnt-name: __ethtool-a-stats-grp-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pad type: pad - @@ -912,8 +1218,13 @@ attribute-sets: name: hist-val - name: stats + attr-cnt-name: __ethtool-a-stats-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: pad type: pad - @@ -933,8 +1244,13 @@ attribute-sets: type: u32 - name: phc-vclocks + attr-cnt-name: __ethtool-a-phc-vclocks-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -947,8 +1263,13 @@ attribute-sets: sub-type: s32 - name: module + attr-cnt-name: __ethtool-a-module-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -960,8 +1281,14 @@ attribute-sets: type: u8 - name: c33-pse-pw-limit + attr-cnt-name: __ethtool-a-c33-pse-pw-limit-cnt + attr-max-name: __ethtool-a-c33-pse-pw-limit-max attributes: - + name: unspec + type: unused + value: 0 + - name: min type: u32 - @@ -969,8 +1296,13 @@ attribute-sets: type: u32 - name: pse + attr-cnt-name: __ethtool-a-pse-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -1027,8 +1359,13 @@ attribute-sets: nested-attributes: c33-pse-pw-limit - name: rss + attr-cnt-name: __ethtool-a-rss-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -1053,8 +1390,13 @@ attribute-sets: type: u32 - name: plca + attr-cnt-name: __ethtool-a-plca-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -1084,8 +1426,13 @@ attribute-sets: type: u32 - name: module-fw-flash + attr-cnt-name: __ethtool-a-module-fw-flash-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -1110,8 +1457,13 @@ attribute-sets: type: uint - name: phy + attr-cnt-name: __ethtool-a-phy-cnt attributes: - + name: unspec + type: unused + value: 0 + - name: header type: nest nested-attributes: header @@ -1137,6 +1489,33 @@ attribute-sets: - name: downstream-sfp-name type: string + - + name: tsconfig + attr-cnt-name: __ethtool-a-tsconfig-cnt + attributes: + - + name: unspec + type: unused + value: 0 + - + name: header + type: nest + nested-attributes: header + - + name: hwtstamp-provider + type: nest + nested-attributes: ts-hwtstamp-provider + - + name: tx-types + type: nest + nested-attributes: bitset + - + name: rx-filters + type: nest + nested-attributes: bitset + - + name: hwtstamp-flags + type: u32 operations: enum-model: directional @@ -1578,6 +1957,7 @@ operations: request: attributes: - header + - hwtstamp-provider reply: attributes: - header @@ -1586,6 +1966,7 @@ operations: - rx-filters - phc-index - stats + - hwtstamp-provider dump: *tsinfo-get-op - name: cable-test-act @@ -1960,3 +2341,32 @@ operations: name: phy-ntf doc: Notification for change in PHY devices. notify: phy-get + - + name: tsconfig-get + doc: Get hwtstamp config. 
+ + attribute-set: tsconfig + + do: &tsconfig-get-op + request: + attributes: + - header + reply: + attributes: &tsconfig + - header + - hwtstamp-provider + - tx-types + - rx-filters + - hwtstamp-flags + dump: *tsconfig-get-op + - + name: tsconfig-set + doc: Set hwtstamp config. + + attribute-set: tsconfig + + do: + request: + attributes: *tsconfig + reply: + attributes: *tsconfig diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 9ffa13b77dcf..96465376d6fe 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -2086,6 +2086,9 @@ attribute-sets: - name: mctp-net type: u32 + - + name: phys-binding + type: u8 - name: stats-attrs name-prefix: ifla-stats- diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt_route.yaml index f4368be0caed..a674103e5bc4 100644 --- a/Documentation/netlink/specs/rt_route.yaml +++ b/Documentation/netlink/specs/rt_route.yaml @@ -177,6 +177,11 @@ attribute-sets: - name: rta-nh-id type: u32 + - + name: rta-flowlabel + type: u32 + byte-order: big-endian + display-hint: hex - name: rta-metrics attributes: @@ -260,6 +265,7 @@ operations: - rta-dport - rta-mark - rta-uid + - rta-flowlabel reply: value: 24 attributes: &all-route-attrs @@ -299,6 +305,7 @@ operations: - rta-sport - rta-dport - rta-nh-id + - rta-flowlabel dump: request: value: 26 diff --git a/Documentation/netlink/specs/rt_rule.yaml b/Documentation/netlink/specs/rt_rule.yaml index 03a8eef7952e..a9debac3058a 100644 --- a/Documentation/netlink/specs/rt_rule.yaml +++ b/Documentation/netlink/specs/rt_rule.yaml @@ -172,6 +172,16 @@ attribute-sets: - name: dscp type: u8 + - + name: flowlabel + type: u32 + byte-order: big-endian + display-hint: hex + - + name: flowlabel-mask + type: u32 + byte-order: big-endian + display-hint: hex operations: enum-model: directional @@ -203,6 +213,8 @@ operations: - sport-range - dport-range - dscp + - flowlabel + - flowlabel-mask - name: newrule-ntf doc: Notify a rule creation diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 7c8d22d68682..a4c1291d2561 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -1963,7 +1963,7 @@ obtain its hardware address from the first slave, which might not match the hardware address of the VLAN interfaces (which was ultimately copied from an earlier slave). -There are two methods to insure that the VLAN device operates +There are two methods to ensure that the VLAN device operates with the correct hardware address if all slaves are removed from a bond interface: @@ -2078,7 +2078,7 @@ as an unsolicited ARP reply (because ARP matches replies on an interface basis), and is discarded. The MII monitor is not affected by the state of the routing table. -The solution here is simply to insure that slaves do not have +The solution here is simply to ensure that slaves do not have routes of their own, and if for some reason they must, those routes do not supersede routes of their master. This should generally be the case, but unusual configurations or errant manual or automatic static @@ -2295,7 +2295,7 @@ active-backup: the switches have an ISL and play together well. 
If the network configuration is such that one switch is specifically a backup switch (e.g., has lower capacity, higher cost, etc), - then the primary option can be used to insure that the + then the primary option can be used to ensure that the preferred link is always used when it is available. broadcast: @@ -2322,7 +2322,7 @@ monitor can provide a higher level of reliability in detecting end to end connectivity failures (which may be caused by the failure of any individual component to pass traffic for any reason). Additionally, the ARP monitor should be configured with multiple targets (at least -one for each switch in the network). This will insure that, +one for each switch in the network). This will ensure that, regardless of which switch is active, the ARP monitor has a suitable target to query. diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index b25926071ece..a7ba6368a4d5 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -237,6 +237,8 @@ Userspace to kernel: ``ETHTOOL_MSG_MM_SET`` set MAC merge layer parameters ``ETHTOOL_MSG_MODULE_FW_FLASH_ACT`` flash transceiver module firmware ``ETHTOOL_MSG_PHY_GET`` get Ethernet PHY information + ``ETHTOOL_MSG_TSCONFIG_GET`` get hw timestamping configuration + ``ETHTOOL_MSG_TSCONFIG_SET`` set hw timestamping configuration ===================================== ================================= Kernel to userspace: @@ -286,6 +288,8 @@ Kernel to userspace: ``ETHTOOL_MSG_MODULE_FW_FLASH_NTF`` transceiver module flash updates ``ETHTOOL_MSG_PHY_GET_REPLY`` Ethernet PHY information ``ETHTOOL_MSG_PHY_NTF`` Ethernet PHY information change + ``ETHTOOL_MSG_TSCONFIG_GET_REPLY`` hw timestamping configuration + ``ETHTOOL_MSG_TSCONFIG_SET_REPLY`` new hw timestamping configuration ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1245,9 +1249,10 @@ Gets timestamping information like ``ETHTOOL_GET_TS_INFO`` ioctl request. Request contents: - ===================================== ====== ========================== - ``ETHTOOL_A_TSINFO_HEADER`` nested request header - ===================================== ====== ========================== + ======================================== ====== ============================ + ``ETHTOOL_A_TSINFO_HEADER`` nested request header + ``ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ======================================== ====== ============================ Kernel response contents: @@ -2243,6 +2248,75 @@ Kernel response contents: When ``ETHTOOL_A_PHY_UPSTREAM_TYPE`` is PHY_UPSTREAM_PHY, the PHY's parent is another PHY. +TSCONFIG_GET +============ + +Retrieves the information about the current hardware timestamping source and +configuration. + +It is similar to the deprecated ``SIOCGHWTSTAMP`` ioctl request. 
+ +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +When set, the ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` attribute identifies the +source of the hw timestamping provider. It is composed of the +``ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX`` attribute, which describes the index of +the PTP device, and ``ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER``, which describes +the qualifier of the timestamp. + +When set, the ``ETHTOOL_A_TSCONFIG_TX_TYPES``, ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` +and the ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` attributes identify the Tx +type, the Rx filter and the flags configured for the current hw timestamping +provider. The attributes are propagated to the driver through the following +structure: + +.. kernel-doc:: include/linux/net_tstamp.h + :identifiers: kernel_hwtstamp_config + +TSCONFIG_SET +============ + +Sets the information about the current hardware timestamping source and +configuration. + +It is similar to the deprecated ``SIOCSHWTSTAMP`` ioctl request. + +Request contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +Kernel response contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +For a description of each attribute, see ``TSCONFIG_GET``. + Request translation =================== @@ -2351,4 +2425,6 @@ are netlink only. n/a ``ETHTOOL_MSG_MM_SET`` n/a ``ETHTOOL_MSG_MODULE_FW_FLASH_ACT`` n/a ``ETHTOOL_MSG_PHY_GET`` + ``SIOCGHWTSTAMP`` ``ETHTOOL_MSG_TSCONFIG_GET`` + ``SIOCSHWTSTAMP`` ``ETHTOOL_MSG_TSCONFIG_SET`` =================================== ===================================== diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index dcbb6f6caf6d..363b4950d542 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER Default: 2 +tcp_tw_reuse_delay - UNSIGNED INTEGER + The delay in milliseconds before a TIME-WAIT socket can be reused by a + new connection, if TIME-WAIT socket reuse is enabled. 
The actual reuse + threshold is within [N, N+1] range, where N is the requested delay in + milliseconds, to ensure the delay interval is never shorter than the + configured value. + + This setting contains an assumption about the peer's TCP timestamp clock + tick interval. It should not be set to a value lower than the peer's + clock tick for the PAWS (Protection Against Wrapped Sequence numbers) + mechanism to work correctly for the reused connection. + + Default: 1000 (milliseconds) + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index 629da6dc6d74..de0263302f16 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1 u8 sysctl_tcp_retries2 u8 sysctl_tcp_orphan_retries u8 sysctl_tcp_tw_reuse timewait_sock_ops +unsigned_int sysctl_tcp_tw_reuse_delay timewait_sock_ops int sysctl_tcp_fin_timeout TCP_LAST_ACK/tcp_rcv_state_process unsigned_int sysctl_tcp_notsent_lowat read_mostly tcp_notsent_lowat/tcp_stream_memory_free u8 sysctl_tcp_sack tcp_syn_options diff --git a/Documentation/networking/netconsole.rst b/Documentation/networking/netconsole.rst index d55c2a22ec7a..94c4680fdf3e 100644 --- a/Documentation/networking/netconsole.rst +++ b/Documentation/networking/netconsole.rst @@ -124,7 +124,7 @@ To remove a target:: The interface exposes these parameters of a netconsole target to userspace: - ============== ================================= ============ + =============== ================================= ============ enabled Is this target currently enabled? (read-write) extended Extended mode enabled (read-write) release Prepend kernel release to message (read-write) @@ -135,7 +135,8 @@ The interface exposes these parameters of a netconsole target to userspace: remote_ip Remote agent's IP address (read-write) local_mac Local interface's MAC address (read-only) remote_mac Remote agent's MAC address (read-write) - ============== ================================= ============ + transmit_errors Number of packet send errors (read-only) + =============== ================================= ============ The "enabled" attribute is also used to control whether the parameters of a target can be updated or not -- you can modify the parameters of only diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index b37bfbfc7d79..61ef9da10e28 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -525,8 +525,8 @@ implicitly defined. ts[0] holds a software timestamp if set, ts[1] is again deprecated and ts[2] holds a hardware timestamp if set. -3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP -======================================================================= +3. Hardware Timestamping configuration: ETHTOOL_MSG_TSCONFIG_SET/GET +==================================================================== Hardware time stamping must also be initialized for each device driver that is expected to do hardware time stamping. The parameter is defined in include/uapi/linux/net_tstamp.h as:: }; Desired behavior is passed into the kernel and to a specific device by -calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose -ifr_data points to a struct hwtstamp_config. 
The tx_type and -rx_filter are hints to the driver what it is expected to do. If -the requested fine-grained filtering for incoming packets is not -supported, the driver may time stamp more than just the requested types -of packets. +sending the tsconfig netlink request ``ETHTOOL_MSG_TSCONFIG_SET``. +The ``ETHTOOL_A_TSCONFIG_TX_TYPES``, ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` and +``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` netlink attributes are then used to set +the struct hwtstamp_config accordingly. + +The ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` netlink nested attribute is used +to select the source of the hardware time stamping. It is composed of an index +for the device source and a qualifier for the type of time stamping. Drivers are free to use a more permissive configuration than the requested configuration. It is expected that drivers should only implement directly the @@ -563,9 +565,16 @@ Only a process with admin rights may change the configuration. User space is responsible to ensure that multiple processes don't interfere with each other and that the settings are reset. -Any process can read the actual configuration by passing this -structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has -not been implemented in all drivers. +Any process can read the actual configuration by sending the tsconfig netlink +request ``ETHTOOL_MSG_TSCONFIG_GET``. + +The legacy configuration method is the ioctl(SIOCSHWTSTAMP) with a pointer +to a struct ifreq whose ifr_data points to a struct hwtstamp_config. +The tx_type and rx_filter are hints to the driver about what it is expected +to do. If the requested fine-grained filtering for incoming packets is not +supported, the driver may time stamp more than just the requested types +of packets. ioctl(SIOCGHWTSTAMP) is used in the same way as the +ioctl(SIOCSHWTSTAMP). However, this has not been implemented in all drivers. :: @@ -610,9 +619,10 @@ not been implemented in all drivers. -------------------------------------------------------- A driver which supports hardware time stamping must support the -SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with -the actual values as described in the section on SIOCSHWTSTAMP. It -should also support SIOCGHWTSTAMP. +ndo_hwtstamp_set NDO or the legacy SIOCSHWTSTAMP ioctl and update the +supplied struct hwtstamp_config with the actual values as described in +the section on SIOCSHWTSTAMP. It should also support ndo_hwtstamp_get or +the legacy SIOCGHWTSTAMP. Time stamps for received packets must be stored in the skb. To get a pointer to the shared time stamp structure of the skb call skb_hwtstamps(). Then diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 658ed3a71e1b..c7904a1bc167 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -200,6 +200,32 @@ received without a cmsg buffer set. recv will never return data from mixed types of TLS records. +TLS 1.3 Key Updates +------------------- + +In TLS 1.3, KeyUpdate handshake messages signal that the sender is +updating its TX key. Any message sent after a KeyUpdate will be +encrypted using the new key. The userspace library can pass the new +key to the kernel using the TLS_TX and TLS_RX socket options, as for +the initial keys. TLS version and cipher cannot be changed. 
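For illustration, here is a minimal sketch of the userspace side of such an RX rekey, written against the uapi definitions in ``linux/tls.h``. The ``ktls_rekey_rx()`` helper name and the fallback ``SOL_TLS`` define are our own, AES-GCM-128 is assumed, and error handling is elided:

.. code-block:: c

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/tls.h>

    #ifndef SOL_TLS
    #define SOL_TLS 282
    #endif

    /* Feed the kernel a fresh TLS 1.3 RX key after a KeyUpdate has
     * been received; the same call with TLS_TX covers the transmit
     * direction.
     */
    static int ktls_rekey_rx(int sock, const unsigned char *key,
                             const unsigned char *iv,
                             const unsigned char *salt)
    {
            struct tls12_crypto_info_aes_gcm_128 info;

            memset(&info, 0, sizeof(info));
            /* TLS version and cipher must match the initial setup. */
            info.info.version = TLS_1_3_VERSION;
            info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
            memcpy(info.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
            memcpy(info.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
            memcpy(info.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
            /* info.rec_seq stays zero: record numbering restarts. */

            /* Same socket option as for the initial key; this also
             * resumes reads paused by the pending KeyUpdate.
             */
            return setsockopt(sock, SOL_TLS, TLS_RX, &info, sizeof(info));
    }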
+ +To prevent attempting to decrypt incoming records using the wrong key, +decryption will be paused when a KeyUpdate message is received by the +kernel, until the new key has been provided using the TLS_RX socket +option. Any read occurring after the KeyUpdate has been read and +before the new key is provided will fail with EKEYEXPIRED. poll() will +not report any read events from the socket until the new key is +provided. There is no pausing on the transmit side. + +Userspace should make sure that the crypto_info provided has been set +properly. In particular, the kernel will not check for key/nonce +reuse. + +The number of successful and failed key updates is tracked in the +``TlsTxRekeyOk``, ``TlsRxRekeyOk``, ``TlsTxRekeyError``, +``TlsRxRekeyError`` statistics. The ``TlsRxRekeyReceived`` statistic +counts KeyUpdate handshake messages that have been received. + Integrating in to userspace TLS library --------------------------------------- @@ -286,3 +312,13 @@ TLS implementation exposes the following per-namespace statistics - ``TlsRxNoPadViolation`` - number of data RX records which had to be re-decrypted due to ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. + +- ``TlsTxRekeyOk``, ``TlsRxRekeyOk`` - + number of successful rekeys on existing sessions for TX and RX + +- ``TlsTxRekeyError``, ``TlsRxRekeyError`` - + number of failed rekeys on existing sessions for TX and RX + +- ``TlsRxRekeyReceived`` - + number of received KeyUpdate handshake messages, requiring userspace + to provide a new RX key diff --git a/Documentation/userspace-api/netlink/c-code-gen.rst b/Documentation/userspace-api/netlink/c-code-gen.rst index 89de42c13350..46415e6d646d 100644 --- a/Documentation/userspace-api/netlink/c-code-gen.rst +++ b/Documentation/userspace-api/netlink/c-code-gen.rst @@ -56,7 +56,9 @@ If ``name-prefix`` is specified it replaces the ``$family-$enum`` portion of the entry name. Boolean ``render-max`` controls creation of the max values -(which are enabled by default for attribute enums). +(which are enabled by default for attribute enums). These max +values are named ``__$pfx-MAX`` and ``$pfx-MAX``. The name +of the first value can be overridden via ``enum-cnt-name`` property. 
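As a concrete illustration, the ``udp-tunnel-type`` definition in the ethtool spec earlier in this series pairs ``render-max: true`` with ``enum-cnt-name: __ethtool-udp-tunnel-type-cnt``; a sketch of the enum this renders to (modulo code-generator details) is:

.. code-block:: c

    enum ethtool_udp_tunnel_type {
            ETHTOOL_UDP_TUNNEL_TYPE_VXLAN,
            ETHTOOL_UDP_TUNNEL_TYPE_GENEVE,
            ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE,

            /* the render-max counter entry, renamed by enum-cnt-name */
            __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
            ETHTOOL_UDP_TUNNEL_TYPE_MAX = (__ETHTOOL_UDP_TUNNEL_TYPE_CNT - 1)
    };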
Attributes ========== diff --git a/MAINTAINERS b/MAINTAINERS index 7e29c4e86db1..e67e0c188349 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2836,6 +2836,13 @@ S: Maintained F: arch/arm64/boot/dts/freescale/s32g*.dts* F: drivers/pinctrl/nxp/ +ARM/NXP S32G/S32R DWMAC ETHERNET DRIVER +M: Jan Petrous <jan.petrous@oss.nxp.com> +L: NXP S32 Linux Team <s32@nxp.com> +S: Maintained +F: Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml +F: drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c + ARM/Orion SoC/Technologic Systems TS-78xx platform support M: Alexander Clouter <alex@digriz.org.uk> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -10280,7 +10287,6 @@ F: drivers/input/touchscreen/himax_hx83112b.c HIPPI M: Jes Sorensen <jes@trained-monkey.org> -L: linux-hippi@sunsite.dk S: Maintained F: drivers/net/hippi/ F: include/linux/hippidevice.h @@ -13944,6 +13950,7 @@ M: Sunil Goutham <sgoutham@marvell.com> M: Geetha sowjanya <gakula@marvell.com> M: Subbaraya Sundeep <sbhatta@marvell.com> M: hariprasad <hkelam@marvell.com> +M: Bharat Bhushan <bbhushan2@marvell.com> L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/marvell/octeontx2/nic/ @@ -16282,7 +16289,7 @@ F: include/linux/inetdevice.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h -F: include/uapi/linux/ethtool_netlink.h +F: include/uapi/linux/ethtool_netlink* F: include/uapi/linux/if_* F: include/uapi/linux/net_shaper.h F: include/uapi/linux/netdev* @@ -17640,6 +17647,7 @@ F: Documentation/core-api/packing.rst F: include/linux/packing.h F: lib/packing.c F: lib/packing_test.c +F: scripts/gen_packed_field_checks.c PADATA PARALLEL EXECUTION MECHANISM M: Steffen Klassert <steffen.klassert@secunet.com> @@ -1367,6 +1367,10 @@ PHONY += scripts_unifdef scripts_unifdef: scripts_basic $(Q)$(MAKE) $(build)=scripts scripts/unifdef +PHONY += scripts_gen_packed_field_checks +scripts_gen_packed_field_checks: scripts_basic + $(Q)$(MAKE) $(build)=scripts scripts/gen_packed_field_checks + # --------------------------------------------------------------------------- # Install diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 302507bf9b5d..3df5f2dd4c0f 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -148,6 +148,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d118d4731580..22fa8f19924a 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -159,6 +159,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index d268d69bfcd2..aa9cd4b951fe 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -140,6 +140,8 @@ #define SCM_TS_OPT_ID 0x404C +#define SO_RCVPRIORITY 0x404D + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 113cd9f353e3..5b464a568664 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -141,6 +141,8 @@ #define SCM_TS_OPT_ID 0x005a +#define SO_RCVPRIORITY 0x005b + #if !defined(__KERNEL__) diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index 
e34a7a46754e..8ec2d4d4f135 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -294,20 +294,6 @@ get_Bprotocol4mask(u_int m) return NULL; } -struct Bprotocol * -get_Bprotocol4id(u_int id) -{ - u_int m; - - if (id < ISDN_P_B_START || id > 63) { - printk(KERN_WARNING "%s id not in range %d\n", - __func__, id); - return NULL; - } - m = 1 << (id & ISDN_P_B_MASK); - return get_Bprotocol4mask(m); -} - int mISDN_register_Bprotocol(struct Bprotocol *bp) { diff --git a/drivers/isdn/mISDN/core.h b/drivers/isdn/mISDN/core.h index 42599f49c189..5617c06de8e4 100644 --- a/drivers/isdn/mISDN/core.h +++ b/drivers/isdn/mISDN/core.h @@ -55,7 +55,6 @@ extern void __add_layer2(struct mISDNchannel *, struct mISDNstack *); extern u_int get_all_Bprotocols(void); struct Bprotocol *get_Bprotocol4mask(u_int); -struct Bprotocol *get_Bprotocol4id(u_int); extern int mISDN_inittimer(u_int *); extern void mISDN_timer_cleanup(void); diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index a2abfade82dd..70814303aab8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -84,7 +84,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } ipversion >>= 4; @@ -94,7 +94,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { @@ -108,7 +108,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else { @@ -124,7 +124,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } @@ -136,7 +136,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } @@ -144,7 +144,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } skb_dst_set(skb, &tun_dst->dst); @@ -194,7 +194,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) - dev_sw_netstats_rx_add(bareudp->dev, len); + dev_dstats_rx_add(bareudp->dev, len); return 0; drop: @@ -589,7 +589,7 @@ static void bareudp_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/drivers/net/can/sja1000/sja1000_platform.c b/drivers/net/can/sja1000/sja1000_platform.c index 
c42ebe9da55a..2d555f854008 100644 --- a/drivers/net/can/sja1000/sja1000_platform.c +++ b/drivers/net/can/sja1000/sja1000_platform.c @@ -230,18 +230,9 @@ static int sp_probe(struct platform_device *pdev) return -ENODEV; } - res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res_mem) - return -ENODEV; - - if (!devm_request_mem_region(&pdev->dev, res_mem->start, - resource_size(res_mem), DRV_NAME)) - return -EBUSY; - - addr = devm_ioremap(&pdev->dev, res_mem->start, - resource_size(res_mem)); - if (!addr) - return -ENOMEM; + addr = devm_platform_get_and_ioremap_resource(pdev, 0, &res_mem); + if (IS_ERR(addr)) + return PTR_ERR(addr); if (of) { irq = platform_get_irq(pdev, 0); diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 285785c942b0..0561b60f668f 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2224,13 +2224,16 @@ int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy) } EXPORT_SYMBOL(b53_eee_init); -int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) +bool b53_support_eee(struct dsa_switch *ds, int port) { struct b53_device *dev = ds->priv; - if (is5325(dev) || is5365(dev)) - return -EOPNOTSUPP; + return !is5325(dev) && !is5365(dev); +} +EXPORT_SYMBOL(b53_support_eee); +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) +{ return 0; } EXPORT_SYMBOL(b53_get_mac_eee); @@ -2240,9 +2243,6 @@ int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) struct b53_device *dev = ds->priv; struct ethtool_keee *p = &dev->ports[port].eee; - if (is5325(dev) || is5365(dev)) - return -EOPNOTSUPP; - p->eee_enabled = e->eee_enabled; b53_eee_enable_set(ds, port, e->eee_enabled); @@ -2298,6 +2298,7 @@ static const struct dsa_switch_ops b53_switch_ops = { .phylink_get_caps = b53_phylink_get_caps, .port_enable = b53_enable_port, .port_disable = b53_disable_port, + .support_eee = b53_support_eee, .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 05141176daf5..99e5cfc98ae8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -384,6 +384,7 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); void b53_disable_port(struct dsa_switch *ds, int port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); +bool b53_support_eee(struct dsa_switch *ds, int port); int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 43bde1f583ff..a53fb6191e6b 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1232,6 +1232,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_wol = bcm_sf2_sw_set_wol, .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, + .support_eee = b53_support_eee, .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 8a03baa6aecc..df314724e6a7 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2544,7 +2544,11 @@ static int ksz_mdio_register(struct ksz_device *dev) 
bus->read = ksz_sw_mdio_read; bus->write = ksz_sw_mdio_write; bus->name = "ksz user smi"; - snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index); + if (ds->dst->index != 0) { + snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d-%d", ds->dst->index, ds->index); + } else { + snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index); + } } ret = ksz_parse_dt_phy_config(dev, bus, mdio_np); @@ -3444,12 +3448,12 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port) return -EOPNOTSUPP; } -static int ksz_validate_eee(struct dsa_switch *ds, int port) +static bool ksz_support_eee(struct dsa_switch *ds, int port) { struct ksz_device *dev = ds->priv; if (!dev->info->internal_phy[port]) - return -EOPNOTSUPP; + return false; switch (dev->chip_id) { case KSZ8563_CHIP_ID: @@ -3461,21 +3465,15 @@ static int ksz_validate_eee(struct dsa_switch *ds, int port) case KSZ9896_CHIP_ID: case KSZ9897_CHIP_ID: case LAN9646_CHIP_ID: - return 0; + return true; } - return -EOPNOTSUPP; + return false; } static int ksz_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { - int ret; - - ret = ksz_validate_eee(ds, port); - if (ret) - return ret; - /* There is no documented control of Tx LPI configuration. */ e->tx_lpi_enabled = true; @@ -3491,11 +3489,6 @@ static int ksz_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { struct ksz_device *dev = ds->priv; - int ret; - - ret = ksz_validate_eee(ds, port); - if (ret) - return ret; if (!e->tx_lpi_enabled) { dev_err(dev->dev, "Disabling EEE Tx LPI is not supported\n"); @@ -4641,6 +4634,7 @@ static const struct dsa_switch_ops ksz_switch_ops = { .cls_flower_add = ksz_cls_flower_add, .cls_flower_del = ksz_cls_flower_del, .port_setup_tc = ksz_setup_tc, + .support_eee = ksz_support_eee, .get_mac_eee = ksz_get_mac_eee, .set_mac_eee = ksz_set_mac_eee, .port_get_default_prio = ksz_port_get_default_prio, diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 086b8b3d5b40..9605febd3573 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -3238,6 +3238,7 @@ const struct dsa_switch_ops mt7530_switch_ops = { .port_mirror_add = mt753x_port_mirror_add, .port_mirror_del = mt753x_port_mirror_del, .phylink_get_caps = mt753x_phylink_get_caps, + .support_eee = dsa_supports_eee, .get_mac_eee = mt753x_get_mac_eee, .set_mac_eee = mt753x_set_mac_eee, .conduit_state_change = mt753x_conduit_state_change, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a792f79270d..570c8642d387 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1289,9 +1289,6 @@ static size_t mv88e6095_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_PORT))) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, MV88E6XXX_G1_STATS_OP_HIST_RX); return 1; @@ -1301,9 +1298,6 @@ static size_t mv88e6250_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & STATS_TYPE_BANK0)) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, MV88E6XXX_G1_STATS_OP_HIST_RX); return 1; @@ -1313,9 +1307,6 @@ static size_t mv88e6320_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_BANK1))) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 
MV88E6XXX_G1_STATS_OP_BANK_1_BIT_9, MV88E6XXX_G1_STATS_OP_HIST_RX); @@ -1326,9 +1317,6 @@ static size_t mv88e6390_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_BANK1))) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, MV88E6XXX_G1_STATS_OP_BANK_1_BIT_10, 0); @@ -1341,6 +1329,9 @@ static size_t mv88e6xxx_stats_get_stat(struct mv88e6xxx_chip *chip, int port, { int ret = 0; + if (!(stat->type & chip->info->stats_type)) + return 0; + if (chip->info->ops->stats_get_stat) { mv88e6xxx_reg_lock(chip); ret = chip->info->ops->stats_get_stat(chip, port, stat, data); @@ -5645,6 +5636,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 5, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ops = &mv88e6250_ops, @@ -5665,6 +5657,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 5, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ops = &mv88e6250_ops, @@ -5687,6 +5680,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5708,6 +5702,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 8, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .ops = &mv88e6095_ops, @@ -5730,6 +5725,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5754,6 +5750,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5776,6 +5773,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 9, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .ops = &mv88e6131_ops, @@ -5800,6 +5798,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .atu_move_port_mask = 0x1f, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -5823,6 +5822,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5848,6 +5848,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5872,6 +5873,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5897,6 +5899,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 
10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5921,6 +5924,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5946,6 +5950,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5968,6 +5973,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 8, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -5992,6 +5998,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .atu_move_port_mask = 0x1f, @@ -6016,6 +6023,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6039,6 +6047,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6063,6 +6072,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6087,6 +6097,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6114,6 +6125,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ptp_support = true, @@ -6138,6 +6150,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6161,6 +6174,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ptp_support = true, @@ -6184,6 +6198,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6208,6 +6223,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6233,6 +6249,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .multi_chip = 
true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -6259,6 +6276,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .atu_move_port_mask = 0x1f, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -6283,6 +6301,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6307,6 +6326,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6332,6 +6352,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6359,6 +6380,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6383,6 +6405,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6408,6 +6431,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6433,6 +6457,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -7074,6 +7099,7 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .get_sset_count = mv88e6xxx_get_sset_count, .port_max_mtu = mv88e6xxx_get_max_mtu, .port_change_mtu = mv88e6xxx_change_mtu, + .support_eee = dsa_supports_eee, .get_mac_eee = mv88e6xxx_get_mac_eee, .set_mac_eee = mv88e6xxx_set_mac_eee, .get_eeprom_len = mv88e6xxx_get_eeprom_len, diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 9fe8e8a7856b..86bf113c9bfa 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -144,6 +144,7 @@ struct mv88e6xxx_info { unsigned int age_time_coeff; unsigned int g1_irqs; unsigned int g2_irqs; + int stats_type; bool pvt; /* Mark certain ports as invalid. 
This is required for example for the diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 59b4a7240b58..90e24bc00b99 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -342,7 +342,7 @@ static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) dev_queue_xmit(skb); ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done, - msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT)); + QCA8K_ETHERNET_TIMEOUT); *val = mgmt_eth_data->data[0]; if (len > QCA_HDR_MGMT_DATA1_LEN) @@ -394,7 +394,7 @@ static int qca8k_write_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) dev_queue_xmit(skb); ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done, - msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT)); + QCA8K_ETHERNET_TIMEOUT); ack = mgmt_eth_data->ack; @@ -2016,6 +2016,7 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .get_ethtool_stats = qca8k_get_ethtool_stats, .get_sset_count = qca8k_get_sset_count, .set_ageing_time = qca8k_set_ageing_time, + .support_eee = dsa_supports_eee, .get_mac_eee = qca8k_get_mac_eee, .set_mac_eee = qca8k_set_mac_eee, .port_enable = qca8k_port_enable, diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h index 3664a2e2f1f6..24962a395754 100644 --- a/drivers/net/dsa/qca/qca8k.h +++ b/drivers/net/dsa/qca/qca8k.h @@ -16,7 +16,7 @@ #define QCA8K_ETHERNET_MDIO_PRIORITY 7 #define QCA8K_ETHERNET_PHY_PRIORITY 6 -#define QCA8K_ETHERNET_TIMEOUT 5 +#define QCA8K_ETHERNET_TIMEOUT msecs_to_jiffies(5) #define QCA8K_NUM_PORTS 7 #define QCA8K_NUM_CPU_PORTS 2 diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index baba204ad62f..3d790f8c6f4d 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -26,12 +26,8 @@ void sja1105_pack(void *buf, const u64 *val, int start, int end, size_t len) pr_err("Start bit (%d) expected to be larger than end (%d)\n", start, end); } else if (rc == -ERANGE) { - if ((start - end + 1) > 64) - pr_err("Field %d-%d too large for 64 bits!\n", - start, end); - else - pr_err("Cannot store %llx inside bits %d-%d (would truncate)\n", - *val, start, end); + pr_err("Field %d-%d too large for 64 bits!\n", + start, end); } dump_stack(); } diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index 9a542e3c9b05..977b42bc1e8c 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -159,7 +159,7 @@ config ETHOC Say Y here if you want to use the OpenCores 10/100 Mbps Ethernet MAC. 
config OA_TC6 - tristate "OPEN Alliance TC6 10BASE-T1x MAC-PHY support" + tristate "OPEN Alliance TC6 10BASE-T1x MAC-PHY support" if COMPILE_TEST depends on SPI select PHYLIB help diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 63c8a2328142..c1295dfad0d0 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -74,7 +74,7 @@ static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) if (threshold < time_since_last_napi && napi_scheduled) { netdev_err(dev, "napi handler hasn't been called for a long time but is scheduled\n"); - reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } schedule_reset: /* Change the state of the device to trigger reset diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index e641dbbea1e2..b854b6b42d77 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -421,18 +421,12 @@ static void xgene_enet_configure_clock(struct xgene_enet_pdata *pdata) if (dev->of_node) { struct clk *parent = clk_get_parent(pdata->clk); + long rate = rgmii_clock(pdata->phy_speed); - switch (pdata->phy_speed) { - case SPEED_10: - clk_set_rate(parent, 2500000); - break; - case SPEED_100: - clk_set_rate(parent, 25000000); - break; - default: - clk_set_rate(parent, 125000000); - break; - } + if (rate < 0) + rate = 125000000; + + clk_set_rate(parent, rate); } #ifdef CONFIG_ACPI else { diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index daa416fb1724..640f500f989d 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -530,19 +530,9 @@ static void macb_set_tx_clk(struct macb *bp, int speed) if (bp->phy_interface == PHY_INTERFACE_MODE_MII) return; - switch (speed) { - case SPEED_10: - rate = 2500000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) return; - } rate_rounded = clk_round_rate(bp->tx_clk, rate); if (rate_rounded < 0) diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index 75401d2a5fb4..a2d7300925a8 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -81,8 +81,7 @@ config UCC_GETH tristate "Freescale QE Gigabit Ethernet" depends on QUICC_ENGINE && PPC32 select FSL_PQ_MDIO - select PHYLIB - select FIXED_PHY + select PHYLINK help This driver supports the Gigabit Ethernet mode of the QUICC Engine, which is available on some Freescale SOCs. 
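/*
 * Editorial sketch, not part of the patch set: the b53, bcm_sf2, ksz,
 * mt7530, mv88e6xxx and qca8k hunks above all move EEE capability
 * reporting out of .get_mac_eee()/.set_mac_eee() (which used to return
 * -EOPNOTSUPP) into the new boolean .support_eee() DSA operation, with
 * dsa_supports_eee() as the stock helper for switches that support EEE
 * on every user port. Reduced to its shape -- my_priv, my_is_old_silicon()
 * and the my_*_mac_eee handlers are made-up stand-ins -- a conversion
 * looks like this:
 */
static bool my_support_eee(struct dsa_switch *ds, int port)
{
	struct my_priv *priv = ds->priv;

	/* Pure capability query; no error plumbing required. */
	return !my_is_old_silicon(priv);
}

static const struct dsa_switch_ops my_switch_ops = {
	.support_eee	= my_support_eee,	/* or dsa_supports_eee */
	.get_mac_eee	= my_get_mac_eee,	/* now free of capability checks */
	.set_mac_eee	= my_set_mac_eee,
};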
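/*
 * Editorial sketch: the mv88e6xxx hunks above hoist the per-family
 * "which statistics banks exist" test out of the four
 * stats_get_stat() variants and into a single stats_type mask in
 * struct mv88e6xxx_info, checked once at the common call site before
 * dispatching to the per-chip op. The general shape of that refactor,
 * with my_chip/my_hw_stat as schematic stand-ins:
 */
struct my_chip_info {
	int stats_type;			/* STATS_TYPE_* banks this chip has */
	const struct my_chip_ops *ops;
};

static size_t my_stats_get_stat(struct my_chip *chip, int port,
				const struct my_hw_stat *stat, u64 *data)
{
	/* One central capability gate instead of a copy per chip family. */
	if (!(stat->type & chip->info->stats_type))
		return 0;

	return chip->info->ops->stats_get_stat(chip, port, stat, data);
}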
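/*
 * Editorial sketch: the xgene and macb hunks above replace open-coded
 * switch (speed) blocks with rgmii_clock() from <linux/phy.h>, which
 * maps a link speed to the RGMII clock rate (2.5/25/125 MHz for
 * 10/100/1000 Mb/s) and returns a negative errno for other speeds.
 * A minimal caller, modelled on the macb conversion:
 */
#include <linux/clk.h>
#include <linux/phy.h>

static void my_set_tx_clk(struct clk *tx_clk, int speed)
{
	long rate = rgmii_clock(speed);

	if (rate < 0)		/* unsupported speed: leave the clock alone */
		return;

	clk_set_rate(tx_clk, rate);
}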
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index bf5baef5c3e0..4948b4906584 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2281,7 +2281,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, new_xdpf->len = xdpf->len; new_xdpf->headroom = priv->tx_headroom; new_xdpf->frame_sz = DPAA_BP_RAW_SIZE; - new_xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + new_xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; /* Release the initial buffer */ xdp_return_frame_rx_napi(xdpf); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index a293b08f36d4..147a93bf9fa9 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -780,13 +780,14 @@ struct ethsw_dump_ctx { static int dpaa2_switch_fdb_dump_nl(struct fdb_dump_entry *entry, struct ethsw_dump_ctx *dump) { + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; int is_dynamic = entry->type & DPSW_FDB_ENTRY_DINAMIC; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 1cca0425d493..c81f2ea588f2 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -671,8 +671,6 @@ struct fec_enet_private { unsigned int tx_time_itr; unsigned int itr_clk_rate; - /* tx lpi eee mode */ - struct ethtool_keee eee; unsigned int clk_ref_rate; /* ptp clock period in ns*/ diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1b55047c0237..b2daed55bf6c 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2045,14 +2045,14 @@ static int fec_enet_us_to_tx_cycle(struct net_device *ndev, int us) return us * (fep->clk_ref_rate / 1000) / 1000; } -static int fec_enet_eee_mode_set(struct net_device *ndev, bool enable) +static int fec_enet_eee_mode_set(struct net_device *ndev, u32 lpi_timer, + bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; unsigned int sleep_cycle, wake_cycle; if (enable) { - sleep_cycle = fec_enet_us_to_tx_cycle(ndev, p->tx_lpi_timer); + sleep_cycle = fec_enet_us_to_tx_cycle(ndev, lpi_timer); wake_cycle = sleep_cycle; } else { sleep_cycle = 0; @@ -2105,7 +2105,9 @@ static void fec_enet_adjust_link(struct net_device *ndev) napi_enable(&fep->napi); } if (fep->quirks & FEC_QUIRK_HAS_EEE) - fec_enet_eee_mode_set(ndev, phy_dev->enable_tx_lpi); + fec_enet_eee_mode_set(ndev, + phy_dev->eee_cfg.tx_lpi_timer, + phy_dev->enable_tx_lpi); } else { if (fep->link) { netif_stop_queue(ndev); @@ -3181,7 +3183,6 @@ static int fec_enet_get_eee(struct net_device *ndev, struct ethtool_keee *edata) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; if (!(fep->quirks & FEC_QUIRK_HAS_EEE)) return -EOPNOTSUPP; @@ -3189,8 +3190,6 @@ fec_enet_get_eee(struct net_device *ndev, struct ethtool_keee *edata) if (!netif_running(ndev)) return -ENETDOWN; - edata->tx_lpi_timer = p->tx_lpi_timer; - return phy_ethtool_get_eee(ndev->phydev, edata); } @@ -3198,7 +3197,6 @@ static int fec_enet_set_eee(struct net_device *ndev, 
struct ethtool_keee *edata) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; if (!(fep->quirks & FEC_QUIRK_HAS_EEE)) return -EOPNOTSUPP; @@ -3206,8 +3204,6 @@ fec_enet_set_eee(struct net_device *ndev, struct ethtool_keee *edata) if (!netif_running(ndev)) return -ENETDOWN; - p->tx_lpi_timer = edata->tx_lpi_timer; - return phy_ethtool_set_eee(ndev->phydev, edata); } diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index fb416d60dcd7..11887458f050 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -2690,13 +2690,12 @@ static struct fman *read_dts_node(struct platform_device *of_dev) { struct fman *fman; struct device_node *fm_node, *muram_node; + void __iomem *base_addr; struct resource *res; u32 val, range[2]; int err, irq; struct clk *clk; u32 clk_rate; - phys_addr_t phys_base_addr; - resource_size_t mem_size; fman = kzalloc(sizeof(*fman), GFP_KERNEL); if (!fman) @@ -2724,18 +2723,6 @@ static struct fman *read_dts_node(struct platform_device *of_dev) goto fman_node_put; fman->dts_params.err_irq = err; - /* Get the FM address */ - res = platform_get_resource(of_dev, IORESOURCE_MEM, 0); - if (!res) { - err = -EINVAL; - dev_err(&of_dev->dev, "%s: Can't get FMan memory resource\n", - __func__); - goto fman_node_put; - } - - phys_base_addr = res->start; - mem_size = resource_size(res); - clk = of_clk_get(fm_node, 0); if (IS_ERR(clk)) { err = PTR_ERR(clk); @@ -2803,24 +2790,16 @@ static struct fman *read_dts_node(struct platform_device *of_dev) } } - fman->dts_params.res = - devm_request_mem_region(&of_dev->dev, phys_base_addr, - mem_size, "fman"); - if (!fman->dts_params.res) { - err = -EBUSY; - dev_err(&of_dev->dev, "%s: request_mem_region() failed\n", - __func__); - goto fman_free; - } - - fman->dts_params.base_addr = - devm_ioremap(&of_dev->dev, phys_base_addr, mem_size); - if (!fman->dts_params.base_addr) { - err = -ENOMEM; + base_addr = devm_platform_get_and_ioremap_resource(of_dev, 0, &res); + if (IS_ERR(base_addr)) { + err = PTR_ERR(base_addr); dev_err(&of_dev->dev, "%s: devm_ioremap() failed\n", __func__); goto fman_free; } + fman->dts_params.base_addr = base_addr; + fman->dts_params.res = res; + fman->dev = &of_dev->dev; err = of_platform_populate(fm_node, NULL, NULL, &of_dev->dev); diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 6663c1768089..f47f8177a93b 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -26,7 +26,7 @@ #include <linux/dma-mapping.h> #include <linux/mii.h> #include <linux/phy.h> -#include <linux/phy_fixed.h> +#include <linux/phylink.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/of_address.h> @@ -34,6 +34,7 @@ #include <linux/of_mdio.h> #include <linux/of_net.h> #include <linux/platform_device.h> +#include <linux/rtnetlink.h> #include <linux/uaccess.h> #include <asm/irq.h> @@ -132,7 +133,6 @@ static const struct ucc_geth_info ugeth_primary_info = { .transmitFlowControl = 1, .maxGroupAddrInHash = 4, .maxIndAddrInHash = 4, - .prel = 7, .maxFrameLength = 1518+16, /* Add extra bytes for VLANs etc. */ .minFrameLength = 64, .maxD1Length = 1520+16, /* Add extra bytes for VLANs etc. 
*/ @@ -1205,34 +1205,6 @@ static int init_mac_station_addr_regs(u8 address_byte_0, return 0; } -static int init_check_frame_length_mode(int length_check, - u32 __iomem *maccfg2_register) -{ - u32 value = 0; - - value = in_be32(maccfg2_register); - - if (length_check) - value |= MACCFG2_LC; - else - value &= ~MACCFG2_LC; - - out_be32(maccfg2_register, value); - return 0; -} - -static int init_preamble_length(u8 preamble_length, - u32 __iomem *maccfg2_register) -{ - if ((preamble_length < 3) || (preamble_length > 7)) - return -EINVAL; - - clrsetbits_be32(maccfg2_register, MACCFG2_PREL_MASK, - preamble_length << MACCFG2_PREL_SHIFT); - - return 0; -} - static int init_rx_parameters(int reject_broadcast, int receive_short_frames, int promiscuous, u32 __iomem *upsmr_register) @@ -1287,94 +1259,11 @@ static int init_min_frame_len(u16 min_frame_length, return 0; } -static int adjust_enet_interface(struct ucc_geth_private *ugeth) +static bool phy_interface_mode_is_reduced(phy_interface_t interface) { - struct ucc_geth_info *ug_info; - struct ucc_geth __iomem *ug_regs; - struct ucc_fast __iomem *uf_regs; - int ret_val; - u32 upsmr, maccfg2; - u16 value; - - ugeth_vdbg("%s: IN", __func__); - - ug_info = ugeth->ug_info; - ug_regs = ugeth->ug_regs; - uf_regs = ugeth->uccf->uf_regs; - - /* Set MACCFG2 */ - maccfg2 = in_be32(&ug_regs->maccfg2); - maccfg2 &= ~MACCFG2_INTERFACE_MODE_MASK; - if ((ugeth->max_speed == SPEED_10) || - (ugeth->max_speed == SPEED_100)) - maccfg2 |= MACCFG2_INTERFACE_MODE_NIBBLE; - else if (ugeth->max_speed == SPEED_1000) - maccfg2 |= MACCFG2_INTERFACE_MODE_BYTE; - maccfg2 |= ug_info->padAndCrc; - out_be32(&ug_regs->maccfg2, maccfg2); - - /* Set UPSMR */ - upsmr = in_be32(&uf_regs->upsmr); - upsmr &= ~(UCC_GETH_UPSMR_RPM | UCC_GETH_UPSMR_R10M | - UCC_GETH_UPSMR_TBIM | UCC_GETH_UPSMR_RMM); - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_RMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_ID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - if (ugeth->phy_interface != PHY_INTERFACE_MODE_RMII) - upsmr |= UCC_GETH_UPSMR_RPM; - switch (ugeth->max_speed) { - case SPEED_10: - upsmr |= UCC_GETH_UPSMR_R10M; - fallthrough; - case SPEED_100: - if (ugeth->phy_interface != PHY_INTERFACE_MODE_RTBI) - upsmr |= UCC_GETH_UPSMR_RMM; - } - } - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_TBI) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - upsmr |= UCC_GETH_UPSMR_TBIM; - } - if (ugeth->phy_interface == PHY_INTERFACE_MODE_SGMII) - upsmr |= UCC_GETH_UPSMR_SGMM; - - out_be32(&uf_regs->upsmr, upsmr); - - /* Disable autonegotiation in tbi mode, because by default it - comes up in autonegotiation mode. */ - /* Note that this depends on proper setting in utbipar register. 
*/ - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_TBI) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - struct ucc_geth_info *ug_info = ugeth->ug_info; - struct phy_device *tbiphy; - - if (!ug_info->tbi_node) - pr_warn("TBI mode requires that the device tree specify a tbi-handle\n"); - - tbiphy = of_phy_find_device(ug_info->tbi_node); - if (!tbiphy) - pr_warn("Could not get TBI device\n"); - - value = phy_read(tbiphy, ENET_TBI_MII_CR); - value &= ~0x1000; /* Turn off autonegotiation */ - phy_write(tbiphy, ENET_TBI_MII_CR, value); - - put_device(&tbiphy->mdio.dev); - } - - init_check_frame_length_mode(ug_info->lengthCheckRx, &ug_regs->maccfg2); - - ret_val = init_preamble_length(ug_info->prel, &ug_regs->maccfg2); - if (ret_val != 0) { - if (netif_msg_probe(ugeth)) - pr_err("Preamble length must be between 3 and 7 inclusive\n"); - return ret_val; - } - - return 0; + return phy_interface_mode_is_rgmii(interface) || + interface == PHY_INTERFACE_MODE_RMII || + interface == PHY_INTERFACE_MODE_RTBI; } static int ugeth_graceful_stop_tx(struct ucc_geth_private *ugeth) @@ -1548,107 +1437,6 @@ static void ugeth_activate(struct ucc_geth_private *ugeth) __netdev_watchdog_up(ugeth->ndev); } -/* Called every time the controller might need to be made - * aware of new link state. The PHY code conveys this - * information through variables in the ugeth structure, and this - * function converts those variables into the appropriate - * register values, and can bring down the device if needed. - */ - -static void adjust_link(struct net_device *dev) -{ - struct ucc_geth_private *ugeth = netdev_priv(dev); - struct ucc_geth __iomem *ug_regs; - struct ucc_fast __iomem *uf_regs; - struct phy_device *phydev = ugeth->phydev; - int new_state = 0; - - ug_regs = ugeth->ug_regs; - uf_regs = ugeth->uccf->uf_regs; - - if (phydev->link) { - u32 tempval = in_be32(&ug_regs->maccfg2); - u32 upsmr = in_be32(&uf_regs->upsmr); - /* Now we make sure that we can be in full duplex mode. - * If not, we operate in half-duplex mode. */ - if (phydev->duplex != ugeth->oldduplex) { - new_state = 1; - if (!(phydev->duplex)) - tempval &= ~(MACCFG2_FDX); - else - tempval |= MACCFG2_FDX; - ugeth->oldduplex = phydev->duplex; - } - - if (phydev->speed != ugeth->oldspeed) { - new_state = 1; - switch (phydev->speed) { - case SPEED_1000: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_BYTE); - break; - case SPEED_100: - case SPEED_10: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_NIBBLE); - /* if reduced mode, re-set UPSMR.R10M */ - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_RMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_ID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - if (phydev->speed == SPEED_10) - upsmr |= UCC_GETH_UPSMR_R10M; - else - upsmr &= ~UCC_GETH_UPSMR_R10M; - } - break; - default: - if (netif_msg_link(ugeth)) - pr_warn( - "%s: Ack! Speed (%d) is not 10/100/1000!", - dev->name, phydev->speed); - break; - } - ugeth->oldspeed = phydev->speed; - } - - if (!ugeth->oldlink) { - new_state = 1; - ugeth->oldlink = 1; - } - - if (new_state) { - /* - * To change the MAC configuration we need to disable - * the controller. 
To do so, we have to either grab - * ugeth->lock, which is a bad idea since 'graceful - * stop' commands might take quite a while, or we can - * quiesce driver's activity. - */ - ugeth_quiesce(ugeth); - ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); - - out_be32(&ug_regs->maccfg2, tempval); - out_be32(&uf_regs->upsmr, upsmr); - - ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); - ugeth_activate(ugeth); - } - } else if (ugeth->oldlink) { - new_state = 1; - ugeth->oldlink = 0; - ugeth->oldspeed = 0; - ugeth->oldduplex = -1; - } - - if (new_state && netif_msg_link(ugeth)) - phy_print_status(phydev); -} - /* Initialize TBI PHY interface for communicating with the * SERDES lynx PHY on the chip. We communicate with this PHY * through the MDIO bus on each controller, treating it as a @@ -1664,8 +1452,7 @@ static void uec_configure_serdes(struct net_device *dev) struct phy_device *tbiphy; if (!ug_info->tbi_node) { - dev_warn(&dev->dev, "SGMII mode requires that the device " - "tree specify a tbi-handle\n"); + dev_warn(&dev->dev, "SGMII mode requires that the device tree specify a tbi-handle\n"); return; } @@ -1696,34 +1483,145 @@ static void uec_configure_serdes(struct net_device *dev) put_device(&tbiphy->mdio.dev); } -/* Configure the PHY for dev. - * returns 0 if success. -1 if failure - */ -static int init_phy(struct net_device *dev) +static void ugeth_mac_link_up(struct phylink_config *config, struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, bool rx_pause) { - struct ucc_geth_private *priv = netdev_priv(dev); - struct ucc_geth_info *ug_info = priv->ug_info; - struct phy_device *phydev; + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); + struct ucc_geth_info *ug_info = ugeth->ug_info; + struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; + struct ucc_fast __iomem *uf_regs = ugeth->uccf->uf_regs; + u32 old_maccfg2, maccfg2 = in_be32(&ug_regs->maccfg2); + u32 old_upsmr, upsmr = in_be32(&uf_regs->upsmr); - priv->oldlink = 0; - priv->oldspeed = 0; - priv->oldduplex = -1; + old_maccfg2 = maccfg2; + old_upsmr = upsmr; - phydev = of_phy_connect(dev, ug_info->phy_node, &adjust_link, 0, - priv->phy_interface); - if (!phydev) { - dev_err(&dev->dev, "Could not attach to PHY\n"); - return -ENODEV; + /* No length check */ + maccfg2 &= ~MACCFG2_LC; + maccfg2 &= ~MACCFG2_INTERFACE_MODE_MASK; + upsmr &= ~(UCC_GETH_UPSMR_RPM | UCC_GETH_UPSMR_R10M | + UCC_GETH_UPSMR_TBIM | UCC_GETH_UPSMR_RMM); + + if (speed == SPEED_10 || speed == SPEED_100) + maccfg2 |= MACCFG2_INTERFACE_MODE_NIBBLE; + else if (speed == SPEED_1000) + maccfg2 |= MACCFG2_INTERFACE_MODE_BYTE; + + maccfg2 |= ug_info->padAndCrc; + + if (phy_interface_mode_is_reduced(interface)) { + + if (interface != PHY_INTERFACE_MODE_RMII) + upsmr |= UCC_GETH_UPSMR_RPM; + + switch (speed) { + case SPEED_10: + upsmr |= UCC_GETH_UPSMR_R10M; + fallthrough; + case SPEED_100: + if (interface != PHY_INTERFACE_MODE_RTBI) + upsmr |= UCC_GETH_UPSMR_RMM; + } } - if (priv->phy_interface == PHY_INTERFACE_MODE_SGMII) - uec_configure_serdes(dev); + if (interface == PHY_INTERFACE_MODE_TBI || + interface == PHY_INTERFACE_MODE_RTBI) + upsmr |= UCC_GETH_UPSMR_TBIM; - phy_set_max_speed(phydev, priv->max_speed); + if (interface == PHY_INTERFACE_MODE_SGMII) + upsmr |= UCC_GETH_UPSMR_SGMM; - priv->phydev = phydev; + if (duplex == DUPLEX_HALF) + maccfg2 &= ~(MACCFG2_FDX); + else + maccfg2 |= MACCFG2_FDX; - return 0; + if (maccfg2 != old_maccfg2 || upsmr != old_upsmr) { + /* + * To 
change the MAC configuration we need to disable + * the controller. To do so, we have to either grab + * ugeth->lock, which is a bad idea since 'graceful + * stop' commands might take quite a while, or we can + * quiesce driver's activity. + */ + ugeth_quiesce(ugeth); + ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); + + out_be32(&ug_regs->maccfg2, maccfg2); + out_be32(&uf_regs->upsmr, upsmr); + + ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); + ugeth_activate(ugeth); + } + + if (interface == PHY_INTERFACE_MODE_SGMII) + uec_configure_serdes(ndev); + + if (!phylink_autoneg_inband(mode)) { + ug_info->aufc = 0; + ug_info->receiveFlowControl = rx_pause; + ug_info->transmitFlowControl = tx_pause; + + init_flow_control_params(ug_info->aufc, + ug_info->receiveFlowControl, + ug_info->transmitFlowControl, + ug_info->pausePeriod, + ug_info->extensionField, + &ugeth->uccf->uf_regs->upsmr, + &ugeth->ug_regs->uempr, + &ugeth->ug_regs->maccfg1); + } + + ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); +} + +static void ugeth_mac_link_down(struct phylink_config *config, + unsigned int mode, phy_interface_t interface) +{ + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); + + ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); +} + +static void ugeth_mac_config(struct phylink_config *config, unsigned int mode, + const struct phylink_link_state *state) +{ + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); + struct ucc_geth_info *ug_info = ugeth->ug_info; + u16 value; + + if (state->interface == PHY_INTERFACE_MODE_TBI || + state->interface == PHY_INTERFACE_MODE_RTBI) { + struct phy_device *tbiphy; + + if (!ug_info->tbi_node) + pr_warn("TBI mode requires that the device tree specify a tbi-handle\n"); + + tbiphy = of_phy_find_device(ug_info->tbi_node); + if (!tbiphy) + pr_warn("Could not get TBI device\n"); + + value = phy_read(tbiphy, ENET_TBI_MII_CR); + value &= ~0x1000; /* Turn off autonegotiation */ + phy_write(tbiphy, ENET_TBI_MII_CR, value); + + put_device(&tbiphy->mdio.dev); + } + + if (phylink_autoneg_inband(mode)) { + ug_info->aufc = 1; + + init_flow_control_params(ug_info->aufc, 1, 1, + ug_info->pausePeriod, + ug_info->extensionField, + &ugeth->uccf->uf_regs->upsmr, + &ugeth->ug_regs->uempr, + &ugeth->ug_regs->maccfg1); + } } static void ugeth_dump_regs(struct ucc_geth_private *ugeth) @@ -1995,7 +1893,6 @@ static void ucc_geth_set_multi(struct net_device *dev) static void ucc_geth_stop(struct ucc_geth_private *ugeth) { struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; - struct phy_device *phydev = ugeth->phydev; ugeth_vdbg("%s: IN", __func__); @@ -2004,7 +1901,7 @@ static void ucc_geth_stop(struct ucc_geth_private *ugeth) * Must be done before disabling the controller * or deadlock may happen. */ - phy_stop(phydev); + phylink_stop(ugeth->phylink); /* Disable the controller */ ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); @@ -3246,12 +3143,6 @@ static int ucc_geth_init_mac(struct ucc_geth_private *ugeth) goto err; } - err = adjust_enet_interface(ugeth); - if (err) { - netif_err(ugeth, ifup, dev, "Cannot configure net device, aborting\n"); - goto err; - } - /* Set MACSTNADDR1, MACSTNADDR2 */ /* For more details see the hardware spec. 
*/ init_mac_station_addr_regs(dev->dev_addr[0], @@ -3263,12 +3154,6 @@ static int ucc_geth_init_mac(struct ucc_geth_private *ugeth) &ugeth->ug_regs->macstnaddr1, &ugeth->ug_regs->macstnaddr2); - err = ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); - if (err) { - netif_err(ugeth, ifup, dev, "Cannot enable net device, aborting\n"); - goto err; - } - return 0; err: ucc_geth_stop(ugeth); @@ -3291,10 +3176,10 @@ static int ucc_geth_open(struct net_device *dev) return -EINVAL; } - err = init_phy(dev); + err = phylink_of_phy_connect(ugeth->phylink, ugeth->dev->of_node, 0); if (err) { - netif_err(ugeth, ifup, dev, "Cannot initialize PHY, aborting\n"); - return err; + dev_err(&dev->dev, "Could not attach to PHY\n"); + return -ENODEV; } err = ucc_geth_init_mac(ugeth); @@ -3310,13 +3195,13 @@ static int ucc_geth_open(struct net_device *dev) goto err; } - phy_start(ugeth->phydev); + phylink_start(ugeth->phylink); napi_enable(&ugeth->napi); netdev_reset_queue(dev); netif_start_queue(dev); device_set_wakeup_capable(&dev->dev, - qe_alive_during_sleep() || ugeth->phydev->irq); + qe_alive_during_sleep() || dev->phydev->irq); device_set_wakeup_enable(&dev->dev, ugeth->wol_en); return err; @@ -3337,8 +3222,7 @@ static int ucc_geth_close(struct net_device *dev) cancel_work_sync(&ugeth->timeout_work); ucc_geth_stop(ugeth); - phy_disconnect(ugeth->phydev); - ugeth->phydev = NULL; + phylink_disconnect_phy(ugeth->phylink); free_irq(ugeth->ug_info->uf_info.irq, ugeth->ndev); @@ -3372,7 +3256,7 @@ static void ucc_geth_timeout_work(struct work_struct *work) ucc_geth_stop(ugeth); ucc_geth_init_mac(ugeth); /* Must start PHY here */ - phy_start(ugeth->phydev); + phylink_start(ugeth->phylink); netif_tx_start_all_queues(dev); } @@ -3397,6 +3281,7 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) { struct net_device *ndev = platform_get_drvdata(ofdev); struct ucc_geth_private *ugeth = netdev_priv(ndev); + bool mac_wol = false; if (!netif_running(ndev)) return 0; @@ -3410,14 +3295,17 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) */ ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); - if (ugeth->wol_en & WAKE_MAGIC) { + if (ugeth->wol_en & WAKE_MAGIC && !ugeth->phy_wol_en) { setbits32(ugeth->uccf->p_uccm, UCC_GETH_UCCE_MPD); setbits32(&ugeth->ug_regs->maccfg2, MACCFG2_MPE); ucc_fast_enable(ugeth->uccf, COMM_DIR_RX_AND_TX); - } else if (!(ugeth->wol_en & WAKE_PHY)) { - phy_stop(ugeth->phydev); + mac_wol = true; } + rtnl_lock(); + phylink_suspend(ugeth->phylink, mac_wol); + rtnl_unlock(); + return 0; } @@ -3451,12 +3339,9 @@ static int ucc_geth_resume(struct platform_device *ofdev) } } - ugeth->oldlink = 0; - ugeth->oldspeed = 0; - ugeth->oldduplex = -1; - - phy_stop(ugeth->phydev); - phy_start(ugeth->phydev); + rtnl_lock(); + phylink_resume(ugeth->phylink); + rtnl_unlock(); napi_enable(&ugeth->napi); netif_device_attach(ndev); @@ -3469,32 +3354,6 @@ static int ucc_geth_resume(struct platform_device *ofdev) #define ucc_geth_resume NULL #endif -static phy_interface_t to_phy_interface(const char *phy_connection_type) -{ - if (strcasecmp(phy_connection_type, "mii") == 0) - return PHY_INTERFACE_MODE_MII; - if (strcasecmp(phy_connection_type, "gmii") == 0) - return PHY_INTERFACE_MODE_GMII; - if (strcasecmp(phy_connection_type, "tbi") == 0) - return PHY_INTERFACE_MODE_TBI; - if (strcasecmp(phy_connection_type, "rmii") == 0) - return PHY_INTERFACE_MODE_RMII; - if (strcasecmp(phy_connection_type, "rgmii") == 0) - return PHY_INTERFACE_MODE_RGMII; - if 
(strcasecmp(phy_connection_type, "rgmii-id") == 0) - return PHY_INTERFACE_MODE_RGMII_ID; - if (strcasecmp(phy_connection_type, "rgmii-txid") == 0) - return PHY_INTERFACE_MODE_RGMII_TXID; - if (strcasecmp(phy_connection_type, "rgmii-rxid") == 0) - return PHY_INTERFACE_MODE_RGMII_RXID; - if (strcasecmp(phy_connection_type, "rtbi") == 0) - return PHY_INTERFACE_MODE_RTBI; - if (strcasecmp(phy_connection_type, "sgmii") == 0) - return PHY_INTERFACE_MODE_SGMII; - - return PHY_INTERFACE_MODE_MII; -} - static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct ucc_geth_private *ugeth = netdev_priv(dev); @@ -3502,10 +3361,7 @@ static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!netif_running(dev)) return -EINVAL; - if (!ugeth->phydev) - return -ENODEV; - - return phy_mii_ioctl(ugeth->phydev, rq, cmd); + return phylink_mii_ioctl(ugeth->phylink, rq, cmd); } static const struct net_device_ops ucc_geth_netdev_ops = { @@ -3513,7 +3369,6 @@ static const struct net_device_ops ucc_geth_netdev_ops = { .ndo_stop = ucc_geth_close, .ndo_start_xmit = ucc_geth_start_xmit, .ndo_validate_addr = eth_validate_addr, - .ndo_change_carrier = fixed_phy_change_carrier, .ndo_set_mac_address = ucc_geth_set_mac_addr, .ndo_set_rx_mode = ucc_geth_set_multi, .ndo_tx_timeout = ucc_geth_timeout, @@ -3553,6 +3408,12 @@ static int ucc_geth_parse_clock(struct device_node *np, const char *which, return 0; } +struct phylink_mac_ops ugeth_mac_ops = { + .mac_link_up = ugeth_mac_link_up, + .mac_link_down = ugeth_mac_link_down, + .mac_config = ugeth_mac_config, +}; + static int ucc_geth_probe(struct platform_device* ofdev) { struct device *device = &ofdev->dev; @@ -3560,23 +3421,12 @@ static int ucc_geth_probe(struct platform_device* ofdev) struct net_device *dev = NULL; struct ucc_geth_private *ugeth = NULL; struct ucc_geth_info *ug_info; + struct device_node *phy_node; + struct phylink *phylink; struct resource res; - int err, ucc_num, max_speed = 0; + int err, ucc_num; const unsigned int *prop; phy_interface_t phy_interface; - static const int enet_to_speed[] = { - SPEED_10, SPEED_10, SPEED_10, - SPEED_100, SPEED_100, SPEED_100, - SPEED_1000, SPEED_1000, SPEED_1000, SPEED_1000, - }; - static const phy_interface_t enet_to_phy_interface[] = { - PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_RMII, - PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_MII, - PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_RGMII, - PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_RGMII, - PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_RTBI, - PHY_INTERFACE_MODE_SGMII, - }; ugeth_vdbg("%s: IN", __func__); @@ -3612,57 +3462,35 @@ static int ucc_geth_probe(struct platform_device* ofdev) ug_info->uf_info.regs = res.start; ug_info->uf_info.irq = irq_of_parse_and_map(np, 0); - ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0); - if (!ug_info->phy_node && of_phy_is_fixed_link(np)) { - /* - * In the case of a fixed PHY, the DT node associated - * to the PHY is the Ethernet MAC DT node. - */ - err = of_phy_register_fixed_link(np); - if (err) - return err; - ug_info->phy_node = of_node_get(np); - } - /* Find the TBI PHY node. 
If it's not there, we don't support SGMII */ ug_info->tbi_node = of_parse_phandle(np, "tbi-handle", 0); - /* get the phy interface type, or default to MII */ - prop = of_get_property(np, "phy-connection-type", NULL); - if (!prop) { - /* handle interface property present in old trees */ - prop = of_get_property(ug_info->phy_node, "interface", NULL); - if (prop != NULL) { - phy_interface = enet_to_phy_interface[*prop]; - max_speed = enet_to_speed[*prop]; - } else - phy_interface = PHY_INTERFACE_MODE_MII; - } else { - phy_interface = to_phy_interface((const char *)prop); - } - - /* get speed, or derive from PHY interface */ - if (max_speed == 0) - switch (phy_interface) { - case PHY_INTERFACE_MODE_GMII: - case PHY_INTERFACE_MODE_RGMII: - case PHY_INTERFACE_MODE_RGMII_ID: - case PHY_INTERFACE_MODE_RGMII_RXID: - case PHY_INTERFACE_MODE_RGMII_TXID: - case PHY_INTERFACE_MODE_TBI: - case PHY_INTERFACE_MODE_RTBI: - case PHY_INTERFACE_MODE_SGMII: - max_speed = SPEED_1000; - break; - default: - max_speed = SPEED_100; - break; + phy_node = of_parse_phandle(np, "phy-handle", 0); + if (phy_node) { + prop = of_get_property(phy_node, "interface", NULL); + if (prop) { + dev_err(&ofdev->dev, + "Device-tree property 'interface' is no longer supported. Please use 'phy-connection-type' instead."); + of_node_put(phy_node); + err = -EINVAL; + goto err_put_tbi; } + of_node_put(phy_node); + } + + err = of_get_phy_mode(np, &phy_interface); + if (err) { + dev_err(&ofdev->dev, "Invalid phy-connection-type"); + goto err_put_tbi; + } - if (max_speed == SPEED_1000) { + if (phy_interface == PHY_INTERFACE_MODE_GMII || + phy_interface_mode_is_rgmii(phy_interface) || + phy_interface == PHY_INTERFACE_MODE_TBI || + phy_interface == PHY_INTERFACE_MODE_RTBI || + phy_interface == PHY_INTERFACE_MODE_SGMII) { unsigned int snums = qe_get_num_of_snums(); - /* configure muram FIFOs for gigabit operation */ ug_info->uf_info.urfs = UCC_GETH_URFS_GIGA_INIT; ug_info->uf_info.urfet = UCC_GETH_URFET_GIGA_INIT; ug_info->uf_info.urfset = UCC_GETH_URFSET_GIGA_INIT; @@ -3691,7 +3519,7 @@ static int ucc_geth_probe(struct platform_device* ofdev) dev = devm_alloc_etherdev(&ofdev->dev, sizeof(*ugeth)); if (!dev) { err = -ENOMEM; - goto err_deregister_fixed_link; + goto err_put_tbi; } ugeth = netdev_priv(dev); @@ -3718,23 +3546,50 @@ static int ucc_geth_probe(struct platform_device* ofdev) dev->max_mtu = 1518; ugeth->msg_enable = netif_msg_init(debug.msg_enable, UGETH_MSG_DEFAULT); - ugeth->phy_interface = phy_interface; - ugeth->max_speed = max_speed; - /* Carrier starts down, phylib will bring it up */ - netif_carrier_off(dev); + ugeth->phylink_config.dev = &dev->dev; + ugeth->phylink_config.type = PHYLINK_NETDEV; + + ugeth->phylink_config.mac_capabilities = + MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; + + __set_bit(PHY_INTERFACE_MODE_MII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_RMII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + ugeth->phylink_config.supported_interfaces); + phy_interface_set_rgmii(ugeth->phylink_config.supported_interfaces); + + if (ug_info->tbi_node) { + __set_bit(PHY_INTERFACE_MODE_SGMII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_TBI, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_RTBI, + ugeth->phylink_config.supported_interfaces); + } + + phylink = phylink_create(&ugeth->phylink_config, dev_fwnode(&dev->dev), + phy_interface, &ugeth_mac_ops); + if (IS_ERR(phylink)) { + err = 
PTR_ERR(phylink); + goto err_put_tbi; + } + + ugeth->phylink = phylink; err = devm_register_netdev(&ofdev->dev, dev); if (err) { if (netif_msg_probe(ugeth)) pr_err("%s: Cannot register net device, aborting\n", dev->name); - goto err_deregister_fixed_link; + goto err_destroy_phylink; } err = of_get_ethdev_address(np, dev); if (err == -EPROBE_DEFER) - goto err_deregister_fixed_link; + goto err_destroy_phylink; ugeth->ug_info = ug_info; ugeth->dev = device; @@ -3743,11 +3598,11 @@ static int ucc_geth_probe(struct platform_device* ofdev) return 0; -err_deregister_fixed_link: - if (of_phy_is_fixed_link(np)) - of_phy_deregister_fixed_link(np); +err_destroy_phylink: + phylink_destroy(phylink); +err_put_tbi: of_node_put(ug_info->tbi_node); - of_node_put(ug_info->phy_node); + return err; } @@ -3755,13 +3610,10 @@ static void ucc_geth_remove(struct platform_device* ofdev) { struct net_device *dev = platform_get_drvdata(ofdev); struct ucc_geth_private *ugeth = netdev_priv(dev); - struct device_node *np = ofdev->dev.of_node; ucc_geth_memclean(ugeth); - if (of_phy_is_fixed_link(np)) - of_phy_deregister_fixed_link(np); + phylink_destroy(ugeth->phylink); of_node_put(ugeth->ug_info->tbi_node); - of_node_put(ugeth->ug_info->phy_node); } static const struct of_device_id ucc_geth_match[] = { diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index 4294ed096ebb..38789faae706 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -16,6 +16,7 @@ #include <linux/kernel.h> #include <linux/list.h> +#include <linux/phylink.h> #include <linux/if_ether.h> #include <soc/fsl/qe/immap_qe.h> @@ -921,7 +922,8 @@ struct ucc_geth_hardware_statistics { #define UCC_GETH_UPSMR_INIT UCC_GETH_UPSMR_RES1 #define UCC_GETH_MACCFG1_INIT 0 -#define UCC_GETH_MACCFG2_INIT (MACCFG2_RESERVED_1) +#define UCC_GETH_MACCFG2_INIT (MACCFG2_RESERVED_1 | \ + (7 << MACCFG2_PREL_SHIFT)) /* Ethernet Address Type. 
*/ enum enet_addr_type { @@ -1073,6 +1075,9 @@ struct ucc_geth_tad_params { u16 vid; }; +struct phylink; +struct phylink_config; + /* GETH protocol initialization structure */ struct ucc_geth_info { struct ucc_fast_info uf_info; @@ -1088,7 +1093,6 @@ struct ucc_geth_info { u8 miminumInterFrameGapEnforcement; u8 backToBackInterFrameGap; int ipAddressAlignment; - int lengthCheckRx; u32 mblinterval; u16 nortsrbytetime; u8 fracsiz; @@ -1114,7 +1118,6 @@ struct ucc_geth_info { int transmitFlowControl; u8 maxGroupAddrInHash; u8 maxIndAddrInHash; - u8 prel; u16 maxFrameLength; u16 minFrameLength; u16 maxD1Length; @@ -1125,7 +1128,6 @@ struct ucc_geth_info { u32 eventRegMask; u16 pausePeriod; u16 extensionField; - struct device_node *phy_node; struct device_node *tbi_node; u8 weightfactor[NUM_TX_QUEUES]; u8 interruptcoalescingmaxvalue[NUM_RX_QUEUES]; @@ -1210,14 +1212,12 @@ struct ucc_geth_private { u16 skb_dirtytx[NUM_TX_QUEUES]; struct ugeth_mii_info *mii_info; - struct phy_device *phydev; - phy_interface_t phy_interface; - int max_speed; uint32_t msg_enable; - int oldspeed; - int oldduplex; - int oldlink; - int wol_en; + u32 wol_en; + u32 phy_wol_en; + + struct phylink *phylink; + struct phylink_config phylink_config; struct device_node *node; }; diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 699f346faf5c..1fb49e5a414a 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -104,14 +104,8 @@ static int uec_get_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; - if (!phydev) - return -ENODEV; - - phy_ethtool_ksettings_get(phydev, cmd); - - return 0; + return phylink_ethtool_ksettings_get(ugeth->phylink, cmd); } static int @@ -119,12 +113,8 @@ uec_set_ksettings(struct net_device *netdev, const struct ethtool_link_ksettings *cmd) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; - if (!phydev) - return -ENODEV; - - return phy_ethtool_ksettings_set(phydev, cmd); + return phylink_ethtool_ksettings_set(ugeth->phylink, cmd); } static void @@ -133,12 +123,7 @@ uec_get_pauseparam(struct net_device *netdev, { struct ucc_geth_private *ugeth = netdev_priv(netdev); - pause->autoneg = ugeth->phydev->autoneg; - - if (ugeth->ug_info->receiveFlowControl) - pause->rx_pause = 1; - if (ugeth->ug_info->transmitFlowControl) - pause->tx_pause = 1; + return phylink_ethtool_get_pauseparam(ugeth->phylink, pause); } static int @@ -146,30 +131,11 @@ uec_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - int ret = 0; ugeth->ug_info->receiveFlowControl = pause->rx_pause; ugeth->ug_info->transmitFlowControl = pause->tx_pause; - if (ugeth->phydev->autoneg) { - if (netif_running(netdev)) { - /* FIXME: automatically restart */ - netdev_info(netdev, "Please re-open the interface\n"); - } - } else { - struct ucc_geth_info *ug_info = ugeth->ug_info; - - ret = init_flow_control_params(ug_info->aufc, - ug_info->receiveFlowControl, - ug_info->transmitFlowControl, - ug_info->pausePeriod, - ug_info->extensionField, - &ugeth->uccf->uf_regs->upsmr, - &ugeth->ug_regs->uempr, - &ugeth->ug_regs->maccfg1); - } - - return ret; + return phylink_ethtool_set_pauseparam(ugeth->phylink, pause); } static uint32_t @@ -343,28 +309,42 @@ uec_get_drvinfo(struct 
net_device *netdev, static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; - if (phydev && phydev->irq) - wol->supported |= WAKE_PHY; + phylink_ethtool_get_wol(ugeth->phylink, wol); + if (qe_alive_during_sleep()) wol->supported |= WAKE_MAGIC; - wol->wolopts = ugeth->wol_en; + wol->wolopts |= ugeth->wol_en; } static int uec_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; + int ret = 0; - if (wol->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) - return -EINVAL; - else if (wol->wolopts & WAKE_PHY && (!phydev || !phydev->irq)) + ret = phylink_ethtool_set_wol(ugeth->phylink, wol); + if (ret == -EOPNOTSUPP) { + ugeth->phy_wol_en = 0; + } else if (ret) { + return ret; + } else { + ugeth->phy_wol_en = wol->wolopts; + goto out; + } + + /* If the PHY isn't handling the WoL and the MAC is asked to more than + * WAKE_MAGIC, error-out + */ + if (!ugeth->phy_wol_en && + wol->wolopts & ~WAKE_MAGIC) return -EINVAL; - else if (wol->wolopts & WAKE_MAGIC && !qe_alive_during_sleep()) + + if (wol->wolopts & WAKE_MAGIC && + !qe_alive_during_sleep()) return -EINVAL; +out: ugeth->wol_en = wol->wolopts; device_set_wakeup_enable(&netdev->dev, ugeth->wol_en); diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 060e0e674938..aa7d723011d0 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1128,20 +1128,6 @@ int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id) return gve_adminq_execute_cmd(priv, &cmd); } -int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu) -{ - union gve_adminq_command cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = cpu_to_be32(GVE_ADMINQ_SET_DRIVER_PARAMETER); - cmd.set_driver_param = (struct gve_adminq_set_driver_parameter) { - .parameter_type = cpu_to_be32(GVE_SET_PARAM_MTU), - .parameter_value = cpu_to_be64(mtu), - }; - - return gve_adminq_execute_cmd(priv, &cmd); -} - int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, dma_addr_t stats_report_addr, u64 interval) { diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index 863683de9694..228217458275 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -612,7 +612,6 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id); int gve_adminq_register_page_list(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id); -int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, dma_addr_t stats_report_addr, u64 interval); int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index f879426cb552..394debc62268 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -1146,8 +1146,7 @@ static void gve_handle_miss_completion(struct gve_priv *priv, /* jiffies can wraparound but time comparisons can handle overflows. 
*/ pending_packet->timeout_jiffies = jiffies + - msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * - MSEC_PER_SEC); + secs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT); add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); *bytes += pending_packet->skb->len; @@ -1191,8 +1190,7 @@ static void remove_miss_completions(struct gve_priv *priv, pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; pending_packet->timeout_jiffies = jiffies + - msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * - MSEC_PER_SEC); + secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT); /* Maintain pending packet in another list so the packet can be * unallocated at a later time. */ diff --git a/drivers/net/ethernet/hisilicon/hibmcge/Makefile b/drivers/net/ethernet/hisilicon/hibmcge/Makefile index ae58ac38c206..7ea15f9ef849 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/Makefile +++ b/drivers/net/ethernet/hisilicon/hibmcge/Makefile @@ -5,4 +5,5 @@ obj-$(CONFIG_HIBMCGE) += hibmcge.o -hibmcge-objs = hbg_main.o hbg_hw.o hbg_mdio.o hbg_irq.o hbg_txrx.o hbg_ethtool.o +hibmcge-objs = hbg_main.o hbg_hw.o hbg_mdio.o hbg_irq.o hbg_txrx.o hbg_ethtool.o \ + hbg_debugfs.o hbg_err.o diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h index 96daf058d387..b4300d8ea4ad 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h @@ -4,6 +4,7 @@ #ifndef __HBG_COMMON_H #define __HBG_COMMON_H +#include <linux/ethtool.h> #include <linux/netdevice.h> #include <linux/pci.h> #include "hbg_reg.h" @@ -33,6 +34,14 @@ enum hbg_tx_state { enum hbg_nic_state { HBG_NIC_STATE_EVENT_HANDLING = 0, + HBG_NIC_STATE_RESETTING, + HBG_NIC_STATE_RESET_FAIL, +}; + +enum hbg_reset_type { + HBG_RESET_TYPE_NONE = 0, + HBG_RESET_TYPE_FLR, + HBG_RESET_TYPE_FUNCTION, }; struct hbg_buffer { @@ -84,6 +93,7 @@ struct hbg_dev_specs { u32 vlan_layers; u32 max_mtu; u32 min_mtu; + u32 uc_mac_num; u32 max_frame_len; u32 rx_buf_size; @@ -114,6 +124,22 @@ struct hbg_mac { u32 duplex; u32 autoneg; u32 link_status; + u32 pause_autoneg; +}; + +struct hbg_mac_table_entry { + u8 addr[ETH_ALEN]; +}; + +struct hbg_mac_filter { + struct hbg_mac_table_entry *mac_table; + u32 table_max_len; + bool enabled; +}; + +/* saved for restore after rest */ +struct hbg_user_def { + struct ethtool_pauseparam pause_param; }; struct hbg_priv { @@ -126,6 +152,9 @@ struct hbg_priv { struct hbg_vector vectors; struct hbg_ring tx_ring; struct hbg_ring rx_ring; + struct hbg_mac_filter filter; + enum hbg_reset_type reset_type; + struct hbg_user_def user_def; }; #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c new file mode 100644 index 000000000000..8473c43d171a --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (c) 2024 Hisilicon Limited. 
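/*
 * Editorial note (annotation, not a line of the new file): hbg_debugfs.c,
 * which starts here, follows the devres-managed debugfs pattern. Each
 * entry is a read-only seq_file registered with
 * debugfs_create_devm_seqfile(), and the whole per-device directory is
 * torn down by a single devm_add_action_or_reset() ->
 * debugfs_remove_recursive() pair, so the driver never tracks individual
 * dentries. Failures are deliberately tolerated because debugfs is
 * best-effort, as the in-file comment notes.
 */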
+ +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/etherdevice.h> +#include <linux/seq_file.h> +#include <linux/string_choices.h> +#include "hbg_common.h" +#include "hbg_debugfs.h" +#include "hbg_hw.h" +#include "hbg_irq.h" +#include "hbg_txrx.h" + +static struct dentry *hbg_dbgfs_root; + +struct hbg_dbg_info { + const char *name; + int (*read)(struct seq_file *seq, void *data); +}; + +#define state_str_true_false(p, s) str_true_false(test_bit(s, &(p)->state)) + +static void hbg_dbg_ring(struct hbg_priv *priv, struct hbg_ring *ring, + struct seq_file *s) +{ + u32 irq_mask = ring->dir == HBG_DIR_TX ? HBG_INT_MSK_TX_B : + HBG_INT_MSK_RX_B; + + seq_printf(s, "ring used num: %u\n", + hbg_get_queue_used_num(ring)); + seq_printf(s, "ring max num: %u\n", ring->len); + seq_printf(s, "ring head: %u, tail: %u\n", ring->head, ring->tail); + seq_printf(s, "fifo used num: %u\n", + hbg_hw_get_fifo_used_num(priv, ring->dir)); + seq_printf(s, "fifo max num: %u\n", + hbg_get_spec_fifo_max_num(priv, ring->dir)); + seq_printf(s, "irq enabled: %s\n", + str_true_false(hbg_hw_irq_is_enabled(priv, irq_mask))); +} + +static int hbg_dbg_tx_ring(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_dbg_ring(priv, &priv->tx_ring, s); + return 0; +} + +static int hbg_dbg_rx_ring(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_dbg_ring(priv, &priv->rx_ring, s); + return 0; +} + +static int hbg_dbg_irq_info(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_irq_info *info; + u32 i; + + for (i = 0; i < priv->vectors.info_array_len; i++) { + info = &priv->vectors.info_array[i]; + seq_printf(s, + "%-20s: enabled: %-5s, logged: %-5s, count: %llu\n", + info->name, + str_true_false(hbg_hw_irq_is_enabled(priv, + info->mask)), + str_true_false(info->need_print), + info->count); + } + + return 0; +} + +static int hbg_dbg_mac_table(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_mac_filter *filter; + u32 i; + + filter = &priv->filter; + seq_printf(s, "mac addr max count: %u\n", filter->table_max_len); + seq_printf(s, "filter enabled: %s\n", str_true_false(filter->enabled)); + + for (i = 0; i < filter->table_max_len; i++) { + if (is_zero_ether_addr(filter->mac_table[i].addr)) + continue; + + seq_printf(s, "[%u] %pM\n", i, filter->mac_table[i].addr); + } + + return 0; +} + +static const char * const reset_type_str[] = {"None", "FLR", "Function"}; + +static int hbg_dbg_nic_state(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + seq_printf(s, "event handling state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_EVENT_HANDLING)); + seq_printf(s, "resetting state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_RESETTING)); + seq_printf(s, "reset fail state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_RESET_FAIL)); + seq_printf(s, "last reset type: %s\n", + reset_type_str[priv->reset_type]); + + return 0; +} + +static const struct hbg_dbg_info hbg_dbg_infos[] = { + { "tx_ring", hbg_dbg_tx_ring }, + { "rx_ring", hbg_dbg_rx_ring }, + { "irq_info", hbg_dbg_irq_info }, + { "mac_table", hbg_dbg_mac_table 
}, + { "nic_state", hbg_dbg_nic_state }, +}; + +static void hbg_debugfs_uninit(void *data) +{ + debugfs_remove_recursive((struct dentry *)data); +} + +void hbg_debugfs_init(struct hbg_priv *priv) +{ + const char *name = pci_name(priv->pdev); + struct device *dev = &priv->pdev->dev; + struct dentry *root; + u32 i; + + root = debugfs_create_dir(name, hbg_dbgfs_root); + + for (i = 0; i < ARRAY_SIZE(hbg_dbg_infos); i++) + debugfs_create_devm_seqfile(dev, hbg_dbg_infos[i].name, + root, hbg_dbg_infos[i].read); + + /* Ignore the failure because debugfs is not a key feature. */ + devm_add_action_or_reset(dev, hbg_debugfs_uninit, root); +} + +void hbg_debugfs_register(void) +{ + hbg_dbgfs_root = debugfs_create_dir("hibmcge", NULL); +} + +void hbg_debugfs_unregister(void) +{ + debugfs_remove_recursive(hbg_dbgfs_root); + hbg_dbgfs_root = NULL; +} diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h new file mode 100644 index 000000000000..80670d66bbeb --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2024 Hisilicon Limited. */ + +#ifndef __HBG_DEBUGFS_H +#define __HBG_DEBUGFS_H + +void hbg_debugfs_register(void); +void hbg_debugfs_unregister(void); + +void hbg_debugfs_init(struct hbg_priv *priv); + +#endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c new file mode 100644 index 000000000000..4d1f4a33391a --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (c) 2024 Hisilicon Limited. + +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/phy.h> +#include <linux/rtnetlink.h> +#include "hbg_common.h" +#include "hbg_err.h" +#include "hbg_hw.h" + +static void hbg_restore_mac_table(struct hbg_priv *priv) +{ + struct hbg_mac_filter *filter = &priv->filter; + u64 addr; + u32 i; + + for (i = 0; i < filter->table_max_len; i++) + if (!is_zero_ether_addr(filter->mac_table[i].addr)) { + addr = ether_addr_to_u64(filter->mac_table[i].addr); + hbg_hw_set_uc_addr(priv, addr, i); + } + + hbg_hw_set_mac_filter_enable(priv, priv->filter.enabled); +} + +static void hbg_restore_user_def_settings(struct hbg_priv *priv) +{ + struct ethtool_pauseparam *pause_param = &priv->user_def.pause_param; + + hbg_restore_mac_table(priv); + hbg_hw_set_mtu(priv, priv->netdev->mtu); + hbg_hw_set_pause_enable(priv, pause_param->tx_pause, + pause_param->rx_pause); +} + +int hbg_rebuild(struct hbg_priv *priv) +{ + int ret; + + ret = hbg_hw_init(priv); + if (ret) + return ret; + + hbg_restore_user_def_settings(priv); + return 0; +} + +static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) +{ + int ret; + + ASSERT_RTNL(); + + if (netif_running(priv->netdev)) { + dev_warn(&priv->pdev->dev, + "failed to reset because port is up\n"); + return -EBUSY; + } + + priv->reset_type = type; + set_bit(HBG_NIC_STATE_RESETTING, &priv->state); + clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + } + + return ret; +} + +static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type) +{ + int ret; + + if (!test_bit(HBG_NIC_STATE_RESETTING, &priv->state) || + type != priv->reset_type) + return 0; + + ASSERT_RTNL(); + 
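+	/* Paired with hbg_reset_prepare(); both must run under the same RTNL section. On the FLR path, hbg_pci_err_reset_prepare() below takes rtnl_lock() and hbg_pci_err_reset_done() releases it. */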
+ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + ret = hbg_rebuild(priv); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + dev_err(&priv->pdev->dev, "failed to rebuild after reset\n"); + return ret; + } + + dev_info(&priv->pdev->dev, "reset done\n"); + return ret; +} + +/* must be protected by rtnl lock */ +int hbg_reset(struct hbg_priv *priv) +{ + int ret; + + ASSERT_RTNL(); + ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION); + if (ret) + return ret; + + return hbg_reset_done(priv, HBG_RESET_TYPE_FUNCTION); +} + +static void hbg_pci_err_reset_prepare(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + + rtnl_lock(); + hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR); +} + +static void hbg_pci_err_reset_done(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_reset_done(priv, HBG_RESET_TYPE_FLR); + rtnl_unlock(); +} + +static const struct pci_error_handlers hbg_pci_err_handler = { + .reset_prepare = hbg_pci_err_reset_prepare, + .reset_done = hbg_pci_err_reset_done, +}; + +void hbg_set_pci_err_handler(struct pci_driver *pdrv) +{ + pdrv->err_handler = &hbg_pci_err_handler; +} diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h new file mode 100644 index 000000000000..d7828e446308 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2024 Hisilicon Limited. */ + +#ifndef __HBG_ERR_H +#define __HBG_ERR_H + +#include <linux/pci.h> + +void hbg_set_pci_err_handler(struct pci_driver *pdrv); +int hbg_reset(struct hbg_priv *priv); +int hbg_rebuild(struct hbg_priv *priv); + +#endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index c3370114aef3..00364a438ec2 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -3,12 +3,193 @@ #include <linux/ethtool.h> #include <linux/phy.h> +#include <linux/rtnetlink.h> +#include "hbg_common.h" +#include "hbg_err.h" #include "hbg_ethtool.h" +#include "hbg_hw.h" + +enum hbg_reg_dump_type { + HBG_DUMP_REG_TYPE_SPEC = 0, + HBG_DUMP_REG_TYPE_MDIO, + HBG_DUMP_REG_TYPE_GMAC, + HBG_DUMP_REG_TYPE_PCU, +}; + +struct hbg_reg_info { + u32 type; + u32 offset; + u32 val; +}; + +#define HBG_DUMP_SPEC_I(offset) {HBG_DUMP_REG_TYPE_SPEC, offset, 0} +#define HBG_DUMP_MDIO_I(offset) {HBG_DUMP_REG_TYPE_MDIO, offset, 0} +#define HBG_DUMP_GMAC_I(offset) {HBG_DUMP_REG_TYPE_GMAC, offset, 0} +#define HBG_DUMP_PCU_I(offset) {HBG_DUMP_REG_TYPE_PCU, offset, 0} + +static const struct hbg_reg_info hbg_dump_reg_infos[] = { + /* dev specs */ + HBG_DUMP_SPEC_I(HBG_REG_SPEC_VALID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_EVENT_REQ_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_PHY_ID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ADDR_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ADDR_HIGH_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_UC_MAC_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MDIO_FREQ_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAX_MTU_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MIN_MTU_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_TX_FIFO_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_RX_FIFO_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_VLAN_LAYERS_ADDR), + + /* mdio */ + HBG_DUMP_MDIO_I(HBG_REG_MDIO_COMMAND_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_ADDR_ADDR), + 
HBG_DUMP_MDIO_I(HBG_REG_MDIO_WDATA_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_RDATA_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_STA_ADDR), + + /* gmac */ + HBG_DUMP_GMAC_I(HBG_REG_DUPLEX_TYPE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_TYPE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FC_TX_TIMER_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_ADDR_LOW_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_ADDR_HIGH_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_MAX_FRAME_SIZE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PORT_MODE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PORT_ENABLE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PAUSE_ENABLE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_AN_NEG_STATE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_TRANSMIT_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_REC_FILT_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_LINE_LOOP_BACK_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_CF_CRC_STRIP_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_MODE_CHANGE_EN_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_LOOP_REG_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_RECV_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_VLAN_CODE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_0_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_0_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_1_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_1_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_2_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_2_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_3_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_3_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_4_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_4_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_5_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_5_ADDR), + + /* pcu */ + HBG_DUMP_PCU_I(HBG_REG_TX_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CFG_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_CLR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_TX_BUS_ERR_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_BUS_ERR_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_MAX_FRAME_LEN_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DEBUG_ST_MCH_ADDR), + HBG_DUMP_PCU_I(HBG_REG_FIFO_CURR_STATUS_ADDR), + HBG_DUMP_PCU_I(HBG_REG_FIFO_HIST_STATUS_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_CFF_DATA_NUM_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_TX_PAUSE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_CFF_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_BUF_SIZE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_BUS_CTRL_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_CTRL_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_PKT_MODE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST0_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST1_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST2_ADDR), + HBG_DUMP_PCU_I(HBG_REG_BUS_RST_EN_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_CLR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_CLR_ADDR), +}; + +static const u32 hbg_dump_type_base_array[] = { + [HBG_DUMP_REG_TYPE_SPEC] = 0, + [HBG_DUMP_REG_TYPE_MDIO] = HBG_REG_MDIO_BASE, + [HBG_DUMP_REG_TYPE_GMAC] = HBG_REG_SGMII_BASE, + [HBG_DUMP_REG_TYPE_PCU] = HBG_REG_SGMII_BASE, +}; + +static int hbg_ethtool_get_regs_len(struct net_device *netdev) +{ + return ARRAY_SIZE(hbg_dump_reg_infos) * sizeof(struct hbg_reg_info); +} + +static void hbg_ethtool_get_regs(struct net_device *netdev, + struct ethtool_regs *regs, void *data) +{ + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_reg_info *info; + u32 i, offset = 0; + + regs->version = 0; + for (i 
= 0; i < ARRAY_SIZE(hbg_dump_reg_infos); i++) { + info = data + offset; + + *info = hbg_dump_reg_infos[i]; + info->val = hbg_reg_read(priv, info->offset); + info->offset -= hbg_dump_type_base_array[info->type]; + + offset += sizeof(*info); + } +} + +static void hbg_ethtool_get_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *param) +{ + struct hbg_priv *priv = netdev_priv(net_dev); + + param->autoneg = priv->mac.pause_autoneg; + hbg_hw_get_pause_enable(priv, &param->tx_pause, &param->rx_pause); +} + +static int hbg_ethtool_set_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *param) +{ + struct hbg_priv *priv = netdev_priv(net_dev); + + priv->mac.pause_autoneg = param->autoneg; + phy_set_asym_pause(priv->mac.phydev, param->rx_pause, param->tx_pause); + + if (!param->autoneg) + hbg_hw_set_pause_enable(priv, param->tx_pause, param->rx_pause); + + priv->user_def.pause_param = *param; + return 0; +} + +static int hbg_ethtool_reset(struct net_device *netdev, u32 *flags) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + if (*flags != ETH_RESET_DEDICATED) + return -EOPNOTSUPP; + + *flags = 0; + return hbg_reset(priv); +} static const struct ethtool_ops hbg_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, + .get_regs_len = hbg_ethtool_get_regs_len, + .get_regs = hbg_ethtool_get_regs, + .get_pauseparam = hbg_ethtool_get_pauseparam, + .set_pauseparam = hbg_ethtool_set_pauseparam, + .reset = hbg_ethtool_reset, + .nway_reset = phy_ethtool_nway_reset, }; void hbg_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c index 05295c2ad439..e7798f213645 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c @@ -3,6 +3,7 @@ #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/if_vlan.h> #include <linux/iopoll.h> #include <linux/minmax.h> #include "hbg_common.h" @@ -67,6 +68,8 @@ static int hbg_hw_dev_specs_init(struct hbg_priv *priv) specs->vlan_layers = hbg_reg_read(priv, HBG_REG_VLAN_LAYERS_ADDR); specs->rx_fifo_num = hbg_reg_read(priv, HBG_REG_RX_FIFO_NUM_ADDR); specs->tx_fifo_num = hbg_reg_read(priv, HBG_REG_TX_FIFO_NUM_ADDR); + specs->uc_mac_num = hbg_reg_read(priv, HBG_REG_UC_MAC_NUM_ADDR); + mac_addr = hbg_reg_read64(priv, HBG_REG_MAC_ADDR_ADDR); u64_to_ether_addr(mac_addr, (u8 *)specs->mac_addr.sa_data); @@ -135,9 +138,13 @@ void hbg_hw_irq_enable(struct hbg_priv *priv, u32 mask, bool enable) hbg_reg_write(priv, HBG_REG_CF_INTRPT_MSK_ADDR, value); } -void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr) +void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr, u32 index) { - hbg_reg_write64(priv, HBG_REG_STATION_ADDR_LOW_2_ADDR, mac_addr); + u32 addr; + + /* mac addr is u64, so the addr offset is 0x8 */ + addr = HBG_REG_STATION_ADDR_LOW_2_ADDR + (index * 0x8); + hbg_reg_write64(priv, addr, mac_addr); } static void hbg_hw_set_pcu_max_frame_len(struct hbg_priv *priv, @@ -161,8 +168,13 @@ static void hbg_hw_set_mac_max_frame_len(struct hbg_priv *priv, void hbg_hw_set_mtu(struct hbg_priv *priv, u16 mtu) { - hbg_hw_set_pcu_max_frame_len(priv, mtu); - hbg_hw_set_mac_max_frame_len(priv, mtu); + u32 frame_len; + + frame_len = mtu + VLAN_HLEN * priv->dev_specs.vlan_layers + + ETH_HLEN + ETH_FCS_LEN; + + hbg_hw_set_pcu_max_frame_len(priv, frame_len); + 
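+	/* e.g. the default MTU of 1500 with one VLAN layer gives frame_len = 1500 + 4 + 14 + 4 = 1522 bytes, the classic maximum single-tagged Ethernet frame */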
hbg_hw_set_mac_max_frame_len(priv, frame_len); } void hbg_hw_mac_enable(struct hbg_priv *priv, u32 enable) @@ -207,6 +219,34 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) HBG_REG_DUPLEX_B, duplex); } +/* only support uc filter */ +void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable) +{ + hbg_reg_write_field(priv, HBG_REG_REC_FILT_CTRL_ADDR, + HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B, enable); +} + +void hbg_hw_set_pause_enable(struct hbg_priv *priv, u32 tx_en, u32 rx_en) +{ + hbg_reg_write_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_TX_B, tx_en); + hbg_reg_write_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_RX_B, rx_en); +} + +void hbg_hw_get_pause_enable(struct hbg_priv *priv, u32 *tx_en, u32 *rx_en) +{ + *tx_en = hbg_reg_read_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_TX_B); + *rx_en = hbg_reg_read_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_RX_B); +} + +void hbg_hw_set_rx_pause_mac_addr(struct hbg_priv *priv, u64 mac_addr) +{ + hbg_reg_write64(priv, HBG_REG_FD_FC_ADDR_LOW_ADDR, mac_addr); +} + static void hbg_hw_init_transmit_ctrl(struct hbg_priv *priv) { u32 ctrl = 0; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h index 14fb39241c93..a4a049b5121d 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h @@ -51,9 +51,13 @@ bool hbg_hw_irq_is_enabled(struct hbg_priv *priv, u32 mask); void hbg_hw_irq_enable(struct hbg_priv *priv, u32 mask, bool enable); void hbg_hw_set_mtu(struct hbg_priv *priv, u16 mtu); void hbg_hw_mac_enable(struct hbg_priv *priv, u32 enable); -void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr); +void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr, u32 index); u32 hbg_hw_get_fifo_used_num(struct hbg_priv *priv, enum hbg_dir dir); void hbg_hw_set_tx_desc(struct hbg_priv *priv, struct hbg_tx_desc *tx_desc); void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr); +void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable); +void hbg_hw_set_pause_enable(struct hbg_priv *priv, u32 tx_en, u32 rx_en); +void hbg_hw_get_pause_enable(struct hbg_priv *priv, u32 *tx_en, u32 *rx_en); +void hbg_hw_set_rx_pause_mac_addr(struct hbg_priv *priv, u64 mac_addr); #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 75505fb5cc4a..bb0f25ac9760 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -6,13 +6,13 @@ #include <linux/netdevice.h> #include <linux/pci.h> #include "hbg_common.h" +#include "hbg_err.h" #include "hbg_ethtool.h" #include "hbg_hw.h" #include "hbg_irq.h" #include "hbg_mdio.h" #include "hbg_txrx.h" - -static void hbg_change_mtu(struct hbg_priv *priv, int new_mtu); +#include "hbg_debugfs.h" static void hbg_all_irq_enable(struct hbg_priv *priv, bool enabled) { @@ -55,11 +55,7 @@ static int hbg_hw_txrx_clear(struct hbg_priv *priv) return ret; /* After reset, regs need to be reconfigured */ - hbg_hw_init(priv); - hbg_hw_set_uc_addr(priv, ether_addr_to_u64(priv->netdev->dev_addr)); - hbg_change_mtu(priv, priv->netdev->mtu); - - return 0; + return hbg_rebuild(priv); } static int hbg_net_stop(struct net_device *netdev) @@ -74,31 +70,127 @@ static int hbg_net_stop(struct net_device *netdev) return hbg_hw_txrx_clear(priv); } +static void hbg_update_promisc_mode(struct 
net_device *netdev, bool overflow) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + /* The MAC filter is enabled only when the table has not overflowed + * and IFF_PROMISC is not set in netdev->flags. + * Otherwise the filter is disabled. + */ + priv->filter.enabled = !(overflow || (netdev->flags & IFF_PROMISC)); + hbg_hw_set_mac_filter_enable(priv, priv->filter.enabled); +} + +static void hbg_set_mac_to_mac_table(struct hbg_priv *priv, + u32 index, const u8 *addr) +{ + if (addr) { + ether_addr_copy(priv->filter.mac_table[index].addr, addr); + hbg_hw_set_uc_addr(priv, ether_addr_to_u64(addr), index); + } else { + eth_zero_addr(priv->filter.mac_table[index].addr); + hbg_hw_set_uc_addr(priv, 0, index); + } +} + +static int hbg_get_index_from_mac_table(struct hbg_priv *priv, + const u8 *addr, u32 *index) +{ + u32 i; + + for (i = 0; i < priv->filter.table_max_len; i++) + if (ether_addr_equal(priv->filter.mac_table[i].addr, addr)) { + *index = i; + return 0; + } + + return -EINVAL; +} + +static int hbg_add_mac_to_filter(struct hbg_priv *priv, const u8 *addr) +{ + u32 index; + + /* already exists */ + if (!hbg_get_index_from_mac_table(priv, addr, &index)) + return 0; + + for (index = 0; index < priv->filter.table_max_len; index++) + if (is_zero_ether_addr(priv->filter.mac_table[index].addr)) { + hbg_set_mac_to_mac_table(priv, index, addr); + return 0; + } + + return -ENOSPC; +} + +static void hbg_del_mac_from_filter(struct hbg_priv *priv, const u8 *addr) +{ + u32 index; + + /* not present */ + if (hbg_get_index_from_mac_table(priv, addr, &index)) + return; + + hbg_set_mac_to_mac_table(priv, index, NULL); +} + +static int hbg_uc_sync(struct net_device *netdev, const unsigned char *addr) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + return hbg_add_mac_to_filter(priv, addr); +} + +static int hbg_uc_unsync(struct net_device *netdev, const unsigned char *addr) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + if (ether_addr_equal(netdev->dev_addr, (u8 *)addr)) + return 0; + + hbg_del_mac_from_filter(priv, addr); + return 0; +} + +static void hbg_net_set_rx_mode(struct net_device *netdev) +{ + int ret; + + ret = __dev_uc_sync(netdev, hbg_uc_sync, hbg_uc_unsync); + + /* If ret != 0, the table has overflowed */ + hbg_update_promisc_mode(netdev, !!ret); +} + static int hbg_net_set_mac_address(struct net_device *netdev, void *addr) { struct hbg_priv *priv = netdev_priv(netdev); u8 *mac_addr; + bool exists; + u32 index; mac_addr = ((struct sockaddr *)addr)->sa_data; if (!is_valid_ether_addr(mac_addr)) return -EADDRNOTAVAIL; - hbg_hw_set_uc_addr(priv, ether_addr_to_u64(mac_addr)); - dev_addr_set(netdev, mac_addr); + /* The index of the host MAC is always 0. + * If the new MAC address already exists in the table, + * install it at index 0 and + * then clear the stale entry.
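+	 * (e.g. an address previously stored at index 3 is written to index 0 and entry 3 is then cleared)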
+ */ + exists = !hbg_get_index_from_mac_table(priv, mac_addr, &index); + hbg_set_mac_to_mac_table(priv, 0, mac_addr); + if (exists) + hbg_set_mac_to_mac_table(priv, index, NULL); + hbg_hw_set_rx_pause_mac_addr(priv, ether_addr_to_u64(mac_addr)); + dev_addr_set(netdev, mac_addr); return 0; } -static void hbg_change_mtu(struct hbg_priv *priv, int new_mtu) -{ - u32 frame_len; - - frame_len = new_mtu + VLAN_HLEN * priv->dev_specs.vlan_layers + - ETH_HLEN + ETH_FCS_LEN; - hbg_hw_set_mtu(priv, frame_len); -} - static int hbg_net_change_mtu(struct net_device *netdev, int new_mtu) { struct hbg_priv *priv = netdev_priv(netdev); @@ -106,7 +198,7 @@ static int hbg_net_change_mtu(struct net_device *netdev, int new_mtu) if (netif_running(netdev)) return -EBUSY; - hbg_change_mtu(priv, new_mtu); + hbg_hw_set_mtu(priv, new_mtu); WRITE_ONCE(netdev->mtu, new_mtu); dev_dbg(&priv->pdev->dev, @@ -142,8 +234,39 @@ static const struct net_device_ops hbg_netdev_ops = { .ndo_set_mac_address = hbg_net_set_mac_address, .ndo_change_mtu = hbg_net_change_mtu, .ndo_tx_timeout = hbg_net_tx_timeout, + .ndo_set_rx_mode = hbg_net_set_rx_mode, }; +static int hbg_mac_filter_init(struct hbg_priv *priv) +{ + struct hbg_dev_specs *dev_specs = &priv->dev_specs; + struct hbg_mac_filter *filter = &priv->filter; + struct hbg_mac_table_entry *tmp_table; + + tmp_table = devm_kcalloc(&priv->pdev->dev, dev_specs->uc_mac_num, + sizeof(*tmp_table), GFP_KERNEL); + if (!tmp_table) + return -ENOMEM; + + filter->mac_table = tmp_table; + filter->table_max_len = dev_specs->uc_mac_num; + filter->enabled = true; + + hbg_hw_set_mac_filter_enable(priv, filter->enabled); + return 0; +} + +static void hbg_init_user_def(struct hbg_priv *priv) +{ + struct ethtool_pauseparam *pause_param = &priv->user_def.pause_param; + + priv->mac.pause_autoneg = HBG_STATUS_ENABLE; + + pause_param->autoneg = priv->mac.pause_autoneg; + hbg_hw_get_pause_enable(priv, &pause_param->tx_pause, + &pause_param->rx_pause); +} + static int hbg_init(struct hbg_priv *priv) { int ret; @@ -160,7 +283,17 @@ static int hbg_init(struct hbg_priv *priv) if (ret) return ret; - return hbg_mdio_init(priv); + ret = hbg_mdio_init(priv); + if (ret) + return ret; + + ret = hbg_mac_filter_init(priv); + if (ret) + return ret; + + hbg_debugfs_init(priv); + hbg_init_user_def(priv); + return 0; } static int hbg_pci_init(struct pci_dev *pdev) @@ -216,13 +349,15 @@ static int hbg_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) return ret; + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netdev->max_mtu = priv->dev_specs.max_mtu; netdev->min_mtu = priv->dev_specs.min_mtu; netdev->netdev_ops = &hbg_netdev_ops; netdev->watchdog_timeo = 5 * HZ; - hbg_change_mtu(priv, ETH_DATA_LEN); + hbg_hw_set_mtu(priv, ETH_DATA_LEN); hbg_net_set_mac_address(priv->netdev, &priv->dev_specs.mac_addr); hbg_ethtool_set_ops(netdev); @@ -245,7 +380,27 @@ static struct pci_driver hbg_driver = { .id_table = hbg_pci_tbl, .probe = hbg_probe, }; -module_pci_driver(hbg_driver); + +static int __init hbg_module_init(void) +{ + int ret; + + hbg_debugfs_register(); + hbg_set_pci_err_handler(&hbg_driver); + ret = pci_register_driver(&hbg_driver); + if (ret) + hbg_debugfs_unregister(); + + return ret; +} +module_init(hbg_module_init); + +static void __exit hbg_module_exit(void) +{ + pci_unregister_driver(&hbg_driver); + hbg_debugfs_unregister(); +} +module_exit(hbg_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c index a3479fba8501..db6bc4cfb971 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c @@ -114,6 +114,19 @@ static void hbg_mdio_init_hw(struct hbg_priv *priv) hbg_mdio_set_command(mac, cmd); } +static void hbg_flowctrl_cfg(struct hbg_priv *priv) +{ + struct phy_device *phydev = priv->mac.phydev; + bool rx_pause; + bool tx_pause; + + if (!priv->mac.pause_autoneg) + return; + + phy_get_pause(phydev, &tx_pause, &rx_pause); + hbg_hw_set_pause_enable(priv, tx_pause, rx_pause); +} + static void hbg_phy_adjust_link(struct net_device *netdev) { struct hbg_priv *priv = netdev_priv(netdev); @@ -140,6 +153,7 @@ static void hbg_phy_adjust_link(struct net_device *netdev) priv->mac.duplex = phydev->duplex; priv->mac.autoneg = phydev->autoneg; hbg_hw_adjust_link(priv, speed, phydev->duplex); + hbg_flowctrl_cfg(priv); } priv->mac.link_status = phydev->link; @@ -168,6 +182,7 @@ static int hbg_phy_connect(struct hbg_priv *priv) return ret; phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT); + phy_support_asym_pause(phydev); phy_attached_info(phydev); return 0; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h index 57d81c6d7633..f12efc12f3c5 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h @@ -10,6 +10,8 @@ #define HBG_REG_MAC_ID_ADDR 0x0008 #define HBG_REG_PHY_ID_ADDR 0x000C #define HBG_REG_MAC_ADDR_ADDR 0x0010 +#define HBG_REG_MAC_ADDR_HIGH_ADDR 0x0014 +#define HBG_REG_UC_MAC_NUM_ADDR 0x0018 #define HBG_REG_MDIO_FREQ_ADDR 0x0024 #define HBG_REG_MAX_MTU_ADDR 0x0028 #define HBG_REG_MIN_MTU_ADDR 0x002C @@ -28,6 +30,7 @@ #define HBG_REG_MDIO_COMMAND_OP_M GENMASK(11, 10) #define HBG_REG_MDIO_COMMAND_PRTAD_M GENMASK(9, 5) #define HBG_REG_MDIO_COMMAND_DEVAD_M GENMASK(4, 0) +#define HBG_REG_MDIO_ADDR_ADDR (HBG_REG_MDIO_BASE + 0x0004) #define HBG_REG_MDIO_WDATA_ADDR (HBG_REG_MDIO_BASE + 0x0008) #define HBG_REG_MDIO_WDATA_M GENMASK(15, 0) #define HBG_REG_MDIO_RDATA_ADDR (HBG_REG_MDIO_BASE + 0x000C) @@ -36,6 +39,10 @@ /* GMAC */ #define HBG_REG_SGMII_BASE 0x10000 #define HBG_REG_DUPLEX_TYPE_ADDR (HBG_REG_SGMII_BASE + 0x0008) +#define HBG_REG_FD_FC_TYPE_ADDR (HBG_REG_SGMII_BASE + 0x000C) +#define HBG_REG_FC_TX_TIMER_ADDR (HBG_REG_SGMII_BASE + 0x001C) +#define HBG_REG_FD_FC_ADDR_LOW_ADDR (HBG_REG_SGMII_BASE + 0x0020) +#define HBG_REG_FD_FC_ADDR_HIGH_ADDR (HBG_REG_SGMII_BASE + 0x0024) #define HBG_REG_DUPLEX_B BIT(0) #define HBG_REG_MAX_FRAME_SIZE_ADDR (HBG_REG_SGMII_BASE + 0x003C) #define HBG_REG_PORT_MODE_ADDR (HBG_REG_SGMII_BASE + 0x0040) @@ -43,20 +50,42 @@ #define HBG_REG_PORT_ENABLE_ADDR (HBG_REG_SGMII_BASE + 0x0044) #define HBG_REG_PORT_ENABLE_RX_B BIT(1) #define HBG_REG_PORT_ENABLE_TX_B BIT(2) +#define HBG_REG_PAUSE_ENABLE_ADDR (HBG_REG_SGMII_BASE + 0x0048) +#define HBG_REG_PAUSE_ENABLE_RX_B BIT(0) +#define HBG_REG_PAUSE_ENABLE_TX_B BIT(1) +#define HBG_REG_AN_NEG_STATE_ADDR (HBG_REG_SGMII_BASE + 0x0058) #define HBG_REG_TRANSMIT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0060) #define HBG_REG_TRANSMIT_CTRL_PAD_EN_B BIT(7) #define HBG_REG_TRANSMIT_CTRL_CRC_ADD_B BIT(6) #define HBG_REG_TRANSMIT_CTRL_AN_EN_B BIT(5) +#define HBG_REG_REC_FILT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0064) +#define HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B BIT(0) +#define HBG_REG_LINE_LOOP_BACK_ADDR (HBG_REG_SGMII_BASE + 
0x01A8) #define HBG_REG_CF_CRC_STRIP_ADDR (HBG_REG_SGMII_BASE + 0x01B0) #define HBG_REG_CF_CRC_STRIP_B BIT(0) #define HBG_REG_MODE_CHANGE_EN_ADDR (HBG_REG_SGMII_BASE + 0x01B4) #define HBG_REG_MODE_CHANGE_EN_B BIT(0) +#define HBG_REG_LOOP_REG_ADDR (HBG_REG_SGMII_BASE + 0x01DC) #define HBG_REG_RECV_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x01E0) #define HBG_REG_RECV_CTRL_STRIP_PAD_EN_B BIT(3) +#define HBG_REG_VLAN_CODE_ADDR (HBG_REG_SGMII_BASE + 0x01E8) +#define HBG_REG_STATION_ADDR_LOW_0_ADDR (HBG_REG_SGMII_BASE + 0x0200) +#define HBG_REG_STATION_ADDR_HIGH_0_ADDR (HBG_REG_SGMII_BASE + 0x0204) +#define HBG_REG_STATION_ADDR_LOW_1_ADDR (HBG_REG_SGMII_BASE + 0x0208) +#define HBG_REG_STATION_ADDR_HIGH_1_ADDR (HBG_REG_SGMII_BASE + 0x020C) #define HBG_REG_STATION_ADDR_LOW_2_ADDR (HBG_REG_SGMII_BASE + 0x0210) #define HBG_REG_STATION_ADDR_HIGH_2_ADDR (HBG_REG_SGMII_BASE + 0x0214) +#define HBG_REG_STATION_ADDR_LOW_3_ADDR (HBG_REG_SGMII_BASE + 0x0218) +#define HBG_REG_STATION_ADDR_HIGH_3_ADDR (HBG_REG_SGMII_BASE + 0x021C) +#define HBG_REG_STATION_ADDR_LOW_4_ADDR (HBG_REG_SGMII_BASE + 0x0220) +#define HBG_REG_STATION_ADDR_HIGH_4_ADDR (HBG_REG_SGMII_BASE + 0x0224) +#define HBG_REG_STATION_ADDR_LOW_5_ADDR (HBG_REG_SGMII_BASE + 0x0228) +#define HBG_REG_STATION_ADDR_HIGH_5_ADDR (HBG_REG_SGMII_BASE + 0x022C) /* PCU */ +#define HBG_REG_TX_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0420) +#define HBG_REG_RX_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0424) +#define HBG_REG_CFG_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0428) #define HBG_REG_CF_INTRPT_MSK_ADDR (HBG_REG_SGMII_BASE + 0x042C) #define HBG_INT_MSK_WE_ERR_B BIT(31) #define HBG_INT_MSK_RBREQ_ERR_B BIT(30) @@ -78,11 +107,17 @@ #define HBG_INT_MSK_RX_B BIT(0) /* just used in driver */ #define HBG_REG_CF_INTRPT_STAT_ADDR (HBG_REG_SGMII_BASE + 0x0434) #define HBG_REG_CF_INTRPT_CLR_ADDR (HBG_REG_SGMII_BASE + 0x0438) +#define HBG_REG_TX_BUS_ERR_ADDR_ADDR (HBG_REG_SGMII_BASE + 0x043C) +#define HBG_REG_RX_BUS_ERR_ADDR_ADDR (HBG_REG_SGMII_BASE + 0x0440) #define HBG_REG_MAX_FRAME_LEN_ADDR (HBG_REG_SGMII_BASE + 0x0444) #define HBG_REG_MAX_FRAME_LEN_M GENMASK(15, 0) +#define HBG_REG_DEBUG_ST_MCH_ADDR (HBG_REG_SGMII_BASE + 0x0450) +#define HBG_REG_FIFO_CURR_STATUS_ADDR (HBG_REG_SGMII_BASE + 0x0454) +#define HBG_REG_FIFO_HIST_STATUS_ADDR (HBG_REG_SGMII_BASE + 0x0458) #define HBG_REG_CF_CFF_DATA_NUM_ADDR (HBG_REG_SGMII_BASE + 0x045C) #define HBG_REG_CF_CFF_DATA_NUM_ADDR_TX_M GENMASK(8, 0) #define HBG_REG_CF_CFF_DATA_NUM_ADDR_RX_M GENMASK(24, 16) +#define HBG_REG_CF_TX_PAUSE_ADDR (HBG_REG_SGMII_BASE + 0x0470) #define HBG_REG_TX_CFF_ADDR_0_ADDR (HBG_REG_SGMII_BASE + 0x0488) #define HBG_REG_TX_CFF_ADDR_1_ADDR (HBG_REG_SGMII_BASE + 0x048C) #define HBG_REG_TX_CFF_ADDR_2_ADDR (HBG_REG_SGMII_BASE + 0x0490) @@ -101,6 +136,10 @@ #define HBG_REG_RX_CTRL_RXBUF_1ST_SKIP_SIZE2_M GENMASK(3, 0) #define HBG_REG_RX_PKT_MODE_ADDR (HBG_REG_SGMII_BASE + 0x04F4) #define HBG_REG_RX_PKT_MODE_PARSE_MODE_M GENMASK(22, 21) +#define HBG_REG_DBG_ST0_ADDR (HBG_REG_SGMII_BASE + 0x05E4) +#define HBG_REG_DBG_ST1_ADDR (HBG_REG_SGMII_BASE + 0x05E8) +#define HBG_REG_DBG_ST2_ADDR (HBG_REG_SGMII_BASE + 0x05EC) +#define HBG_REG_BUS_RST_EN_ADDR (HBG_REG_SGMII_BASE + 0x0688) #define HBG_REG_CF_IND_TXINT_MSK_ADDR (HBG_REG_SGMII_BASE + 0x0694) #define HBG_REG_IND_INTR_MASK_B BIT(0) #define HBG_REG_CF_IND_TXINT_STAT_ADDR (HBG_REG_SGMII_BASE + 0x0698) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.c b/drivers/net/ethernet/huawei/hinic/hinic_port.c index f81a43d2cdfc..486fb0e20bef 100644 --- 
a/drivers/net/ethernet/huawei/hinic/hinic_port.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_port.c @@ -469,7 +469,7 @@ int hinic_set_vlan_fliter(struct hinic_dev *nic_dev, u32 en) err = HINIC_MGMT_CMD_UNSUPPORTED; } else if (err || !out_size || vlan_filter.status) { dev_err(&pdev->dev, - "Failed to set vlan fliter, err: %d, status: 0x%x, out size: 0x%x\n", + "Failed to set vlan filter, err: %d, status: 0x%x, out size: 0x%x\n", err, vlan_filter.status, out_size); err = -EINVAL; } diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 20bc40eec487..24ec9a4f1ffa 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -292,6 +292,7 @@ config ICE select DIMLIB select LIBIE select NET_DEVLINK + select PACKING select PLDMFW select DPLL help diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 3307d551f431..9e0d9f710441 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -32,7 +32,8 @@ ice-y := ice_main.o \ ice_parser_rt.o \ ice_idc.o \ devlink/devlink.o \ - devlink/devlink_port.o \ + devlink/health.o \ + devlink/port.o \ ice_sf_eth.o \ ice_sf_vsi_vlan_ops.o \ ice_ddp.o \ diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index 415445cefdb2..1b10682c00b8 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -6,7 +6,7 @@ #include "ice.h" #include "ice_lib.h" #include "devlink.h" -#include "devlink_port.h" +#include "port.h" #include "ice_eswitch.h" #include "ice_fw_update.h" #include "ice_dcb_lib.h" diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c new file mode 100644 index 000000000000..d23ae3aafaa7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024, Intel Corporation. */ + +#include "health.h" +#include "ice.h" + +#define ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, obj, name) \ + devlink_fmsg_put(fmsg, #name, (obj)->name) + +/** + * ice_devlink_health_report - boilerplate to call the given @reporter + * + * @reporter: devlink health reporter to call, do nothing on NULL + * @msg: message to pass up, "event name" is fine + * @priv_ctx: typically some event struct + */ +static void ice_devlink_health_report(struct devlink_health_reporter *reporter, + const char *msg, void *priv_ctx) +{ + if (!reporter) + return; + + /* We do not do automatic recovery, so the return value of the function + * below will always be 0; thus we ignore it.
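+	 * (No .recover callback is registered in the reporter ops defined below, so there is nothing for devlink to run or propagate.)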
+ */ + devlink_health_report(reporter, msg, priv_ctx); +} + +struct ice_mdd_event { + enum ice_mdd_src src; + u16 vf_num; + u16 queue; + u8 pf_num; + u8 event; +}; + +static const char *ice_mdd_src_to_str(enum ice_mdd_src src) +{ + switch (src) { + case ICE_MDD_SRC_TX_PQM: + return "tx_pqm"; + case ICE_MDD_SRC_TX_TCLAN: + return "tx_tclan"; + case ICE_MDD_SRC_TX_TDPU: + return "tx_tdpu"; + case ICE_MDD_SRC_RX: + return "rx"; + default: + return "invalid"; + } +} + +static int +ice_mdd_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct ice_mdd_event *mdd_event = priv_ctx; + const char *src; + + if (!mdd_event) + return 0; + + src = ice_mdd_src_to_str(mdd_event->src); + + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "src", src); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, pf_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, vf_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, event); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, queue); + devlink_fmsg_obj_nest_end(fmsg); + + return 0; +} + +/** + * ice_report_mdd_event - Report an MDD event through devlink health + * @pf: the PF device structure + * @src: the HW block that was the source of this MDD event + * @pf_num: the pf_num on which the MDD event occurred + * @vf_num: the vf_num on which the MDD event occurred + * @event: the event type of the MDD event + * @queue: the queue on which the MDD event occurred + * + * Report an MDD event that has occurred on this PF. + */ +void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num, + u16 vf_num, u8 event, u16 queue) +{ + struct ice_mdd_event ev = { + .src = src, + .pf_num = pf_num, + .vf_num = vf_num, + .event = event, + .queue = queue, + }; + + ice_devlink_health_report(pf->health_reporters.mdd, "MDD event", &ev); +} + +/** + * ice_fmsg_put_ptr - put hex value of pointer into fmsg + * + * @fmsg: devlink fmsg under construction + * @name: name to pass + * @ptr: 64 bit value to print as hex and put into fmsg + */ +static void ice_fmsg_put_ptr(struct devlink_fmsg *fmsg, const char *name, + void *ptr) +{ + char buf[sizeof(ptr) * 3]; + + sprintf(buf, "%p", ptr); + devlink_fmsg_put(fmsg, name, buf); +} + +struct ice_tx_hang_event { + u32 head; + u32 intr; + u16 vsi_num; + u16 queue; + u16 next_to_clean; + u16 next_to_use; + struct ice_tx_ring *tx_ring; +}; + +static int ice_tx_hang_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct ice_tx_hang_event *event = priv_ctx; + struct sk_buff *skb; + + if (!event) + return 0; + + skb = event->tx_ring->tx_buf->skb; + devlink_fmsg_obj_nest_start(fmsg); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, head); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, intr); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, vsi_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, queue); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, next_to_clean); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, next_to_use); + devlink_fmsg_put(fmsg, "irq-mapping", event->tx_ring->q_vector->name); + ice_fmsg_put_ptr(fmsg, "desc-ptr", event->tx_ring->desc); + ice_fmsg_put_ptr(fmsg, "dma-ptr", (void *)(long)event->tx_ring->dma); + ice_fmsg_put_ptr(fmsg, "skb-ptr", skb); + devlink_fmsg_binary_pair_put(fmsg, "desc", event->tx_ring->desc, + event->tx_ring->count * sizeof(struct ice_tx_desc)); + devlink_fmsg_dump_skb(fmsg, skb); + devlink_fmsg_obj_nest_end(fmsg); + + return 0; +} + +void 
ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring, + u16 vsi_num, u32 head, u32 intr) +{ + struct ice_health_tx_hang_buf *buf = &pf->health_reporters.tx_hang_buf; + + buf->tx_ring = tx_ring; + buf->vsi_num = vsi_num; + buf->head = head; + buf->intr = intr; +} + +void ice_report_tx_hang(struct ice_pf *pf) +{ + struct ice_health_tx_hang_buf *buf = &pf->health_reporters.tx_hang_buf; + struct ice_tx_ring *tx_ring = buf->tx_ring; + + struct ice_tx_hang_event ev = { + .head = buf->head, + .intr = buf->intr, + .vsi_num = buf->vsi_num, + .queue = tx_ring->q_index, + .next_to_clean = tx_ring->next_to_clean, + .next_to_use = tx_ring->next_to_use, + .tx_ring = tx_ring, + }; + + ice_devlink_health_report(pf->health_reporters.tx_hang, "Tx hang", &ev); +} + +static struct devlink_health_reporter * +ice_init_devlink_rep(struct ice_pf *pf, + const struct devlink_health_reporter_ops *ops) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct devlink_health_reporter *rep; + const u64 graceful_period = 0; + + rep = devl_health_reporter_create(devlink, ops, graceful_period, pf); + if (IS_ERR(rep)) { + struct device *dev = ice_pf_to_dev(pf); + + dev_err(dev, "failed to create devlink %s health reporter", + ops->name); + return NULL; + } + return rep; +} + +#define ICE_DEFINE_HEALTH_REPORTER_OPS(_name) \ + static const struct devlink_health_reporter_ops ice_ ## _name ## _reporter_ops = { \ + .name = #_name, \ + .dump = ice_ ## _name ## _reporter_dump, \ +} + +ICE_DEFINE_HEALTH_REPORTER_OPS(mdd); +ICE_DEFINE_HEALTH_REPORTER_OPS(tx_hang); + +/** + * ice_health_init - allocate and init all ice devlink health reporters and + * accompanying data + * + * @pf: PF struct + */ +void ice_health_init(struct ice_pf *pf) +{ + struct ice_health *reps = &pf->health_reporters; + + reps->mdd = ice_init_devlink_rep(pf, &ice_mdd_reporter_ops); + reps->tx_hang = ice_init_devlink_rep(pf, &ice_tx_hang_reporter_ops); +} + +/** + * ice_deinit_devl_reporter - destroy given devlink health reporter + * @reporter: reporter to destroy + */ +static void ice_deinit_devl_reporter(struct devlink_health_reporter *reporter) +{ + if (reporter) + devl_health_reporter_destroy(reporter); +} + +/** + * ice_health_deinit - deallocate all ice devlink health reporters and + * accompanying data + * + * @pf: PF struct + */ +void ice_health_deinit(struct ice_pf *pf) +{ + ice_deinit_devl_reporter(pf->health_reporters.mdd); + ice_deinit_devl_reporter(pf->health_reporters.tx_hang); +} + +static +void ice_health_assign_healthy_state(struct devlink_health_reporter *reporter) +{ + if (reporter) + devlink_health_reporter_state_update(reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); +} + +/** + * ice_health_clear - clear devlink health issues after a reset + * @pf: the PF device structure + * + * Mark the PF in healthy state again after a reset has completed. + */ +void ice_health_clear(struct ice_pf *pf) +{ + ice_health_assign_healthy_state(pf->health_reporters.mdd); + ice_health_assign_healthy_state(pf->health_reporters.tx_hang); +} diff --git a/drivers/net/ethernet/intel/ice/devlink/health.h b/drivers/net/ethernet/intel/ice/devlink/health.h new file mode 100644 index 000000000000..532277fc57d7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/devlink/health.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024, Intel Corporation.
*/ + +#ifndef _HEALTH_H_ +#define _HEALTH_H_ + +#include <linux/types.h> + +/** + * DOC: health.h + * + * This header file stores everything that is needed for the devlink health + * mechanism of the ice driver. + */ + +struct ice_pf; +struct ice_tx_ring; + +enum ice_mdd_src { + ICE_MDD_SRC_TX_PQM, + ICE_MDD_SRC_TX_TCLAN, + ICE_MDD_SRC_TX_TDPU, + ICE_MDD_SRC_RX, +}; + +/** + * struct ice_health - stores ice devlink health reporters and accompanying data + * @tx_hang: devlink health reporter for tx_hang event + * @mdd: devlink health reporter for MDD detection event + * @tx_hang_buf: pre-allocated place to put info for Tx hang reporter from + * non-sleeping context + * @tx_ring: ring that the hang occurred on + * @head: descriptor head + * @intr: interrupt register value + * @vsi_num: VSI owning the queue that the hang occurred on + */ +struct ice_health { + struct devlink_health_reporter *mdd; + struct devlink_health_reporter *tx_hang; + struct_group_tagged(ice_health_tx_hang_buf, tx_hang_buf, + struct ice_tx_ring *tx_ring; + u32 head; + u32 intr; + u16 vsi_num; + ); +}; + +void ice_health_init(struct ice_pf *pf); +void ice_health_deinit(struct ice_pf *pf); +void ice_health_clear(struct ice_pf *pf); + +void ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring, + u16 vsi_num, u32 head, u32 intr); +void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num, + u16 vf_num, u8 event, u16 queue); +void ice_report_tx_hang(struct ice_pf *pf); + +#endif /* _HEALTH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c b/drivers/net/ethernet/intel/ice/devlink/port.c index c6779d9dffff..767419a67fef 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c +++ b/drivers/net/ethernet/intel/ice/devlink/port.c @@ -5,7 +5,7 @@ #include "ice.h" #include "devlink.h" -#include "devlink_port.h" +#include "port.h" #include "ice_lib.h" #include "ice_fltr.h" diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink_port.h b/drivers/net/ethernet/intel/ice/devlink/port.h index d60efc340945..d60efc340945 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink_port.h +++ b/drivers/net/ethernet/intel/ice/devlink/port.h diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2f5d6f974185..71e05d30f0fd 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -78,6 +78,7 @@ #include "ice_irq.h" #include "ice_dpll.h" #include "ice_adapter.h" +#include "devlink/health.h" #define ICE_BAR0 0 #define ICE_REQ_DESC_MULTIPLE 32 @@ -665,6 +666,7 @@ struct ice_pf { struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES]; struct ice_dplls dplls; struct device *hwmon_dev; + struct ice_health health_reporters; u8 num_quanta_prof_used; }; diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 1489a8ceec51..3bf05b135b35 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -12,6 +12,13 @@ #define ICE_AQC_TOPO_MAX_LEVEL_NUM 0x9 #define ICE_AQ_SET_MAC_FRAME_SIZE_MAX 9728 +#define ICE_RXQ_CTX_SIZE_DWORDS 8 +#define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) +#define ICE_TXQ_CTX_SZ 22 + +typedef struct __packed { u8 buf[ICE_RXQ_CTX_SZ]; } ice_rxq_ctx_buf_t; +typedef struct __packed { u8 buf[ICE_TXQ_CTX_SZ]; } ice_txq_ctx_buf_t; + struct ice_aqc_generic { __le32 param0; __le32 param1; @@ -2084,10 +2091,10 @@ struct ice_aqc_add_txqs_perq { __le16 
txq_id; u8 rsvd[2]; __le32 q_teid; - u8 txq_ctx[22]; + ice_txq_ctx_buf_t txq_ctx; u8 rsvd2[2]; struct ice_aqc_txsched_elem info; -}; +} __packed; /* The format of the command buffer for Add Tx LAN Queues (0x0C30) * is an array of the following structs. Please note that the length of diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 82a9cd4ec7ae..b2af8e3586f7 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -454,6 +454,9 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; + /* Enable descriptor prefetch */ + rlan_ctx.prefena = 1; + /* PF acts as uplink for switchdev; set flex descriptor with src_vsi * metadata and flags to allow redirecting to PR netdev */ @@ -910,8 +913,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring, ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); /* copy context contents into the qg_buf */ qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, - ice_tlan_ctx_info); + ice_pack_txq_ctx(&tlan_ctx, &qg_buf->txqs[0].txq_ctx); /* init queue specific tail reg. It is referred as * transmit comm scheduler queue doorbell. diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 496d86cbd13f..f89bc6ede315 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -6,6 +6,7 @@ #include "ice_adminq_cmd.h" #include "ice_flow.h" #include "ice_ptp_hw.h" +#include <linux/packing.h> #define ICE_PF_RESET_WAIT_COUNT 300 #define ICE_MAX_NETLIST_SIZE 10 @@ -1360,39 +1361,31 @@ int ice_reset(struct ice_hw *hw, enum ice_reset_req req) } /** - * ice_copy_rxq_ctx_to_hw + * ice_copy_rxq_ctx_to_hw - Copy packed Rx queue context to HW registers * @hw: pointer to the hardware structure - * @ice_rxq_ctx: pointer to the rxq context + * @rxq_ctx: pointer to the packed Rx queue context * @rxq_index: the index of the Rx queue - * - * Copies rxq context from dense structure to HW register space */ -static int -ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, u8 *ice_rxq_ctx, u32 rxq_index) +static void ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, + const ice_rxq_ctx_buf_t *rxq_ctx, + u32 rxq_index) { - u8 i; - - if (!ice_rxq_ctx) - return -EINVAL; - - if (rxq_index > QRX_CTRL_MAX_INDEX) - return -EINVAL; - /* Copy each dword separately to HW */ - for (i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) { - wr32(hw, QRX_CONTEXT(i, rxq_index), - *((u32 *)(ice_rxq_ctx + (i * sizeof(u32))))); + for (int i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) { + u32 ctx = ((const u32 *)rxq_ctx)[i]; - ice_debug(hw, ICE_DBG_QCTX, "qrxdata[%d]: %08X\n", i, - *((u32 *)(ice_rxq_ctx + (i * sizeof(u32))))); - } + wr32(hw, QRX_CONTEXT(i, rxq_index), ctx); - return 0; + ice_debug(hw, ICE_DBG_QCTX, "qrxdata[%d]: %08X\n", i, ctx); + } } +#define ICE_CTX_STORE(struct_name, struct_field, width, lsb) \ + PACKED_FIELD((lsb) + (width) - 1, (lsb), struct struct_name, struct_field) + /* LAN Rx Queue Context */ -static const struct ice_ctx_ele ice_rlan_ctx_info[] = { - /* Field Width LSB */ +static const struct packed_field_u8 ice_rlan_ctx_fields[] = { + /* Field Width LSB */ ICE_CTX_STORE(ice_rlan_ctx, head, 13, 0), ICE_CTX_STORE(ice_rlan_ctx, cpuid, 8, 13), ICE_CTX_STORE(ice_rlan_ctx, base, 57, 32), @@ -1413,35 +1406,50 @@ static const struct ice_ctx_ele ice_rlan_ctx_info[] = { ICE_CTX_STORE(ice_rlan_ctx, tphhead_ena, 1, 
196), ICE_CTX_STORE(ice_rlan_ctx, lrxqthresh, 3, 198), ICE_CTX_STORE(ice_rlan_ctx, prefena, 1, 201), - { 0 } }; /** - * ice_write_rxq_ctx + * ice_pack_rxq_ctx - Pack Rx queue context into a HW buffer + * @ctx: the Rx queue context to pack + * @buf: the HW buffer to pack into + * + * Pack the Rx queue context from the CPU-friendly unpacked buffer into its + * bit-packed HW layout. + */ +static void ice_pack_rxq_ctx(const struct ice_rlan_ctx *ctx, + ice_rxq_ctx_buf_t *buf) +{ + pack_fields(buf, sizeof(*buf), ctx, ice_rlan_ctx_fields, + QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST); +} + +/** + * ice_write_rxq_ctx - Write Rx Queue context to hardware * @hw: pointer to the hardware structure - * @rlan_ctx: pointer to the rxq context + * @rlan_ctx: pointer to the unpacked Rx queue context * @rxq_index: the index of the Rx queue * - * Converts rxq context from sparse to dense structure and then writes - * it to HW register space and enables the hardware to prefetch descriptors - * instead of only fetching them on demand + * Pack the sparse Rx Queue context into dense hardware format and write it + * into the HW register space. + * + * Return: 0 on success, or -EINVAL if the Rx queue index is invalid. */ int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index) { - u8 ctx_buf[ICE_RXQ_CTX_SZ] = { 0 }; + ice_rxq_ctx_buf_t buf = {}; - if (!rlan_ctx) + if (rxq_index > QRX_CTRL_MAX_INDEX) return -EINVAL; - rlan_ctx->prefena = 1; + ice_pack_rxq_ctx(rlan_ctx, &buf); + ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); - ice_set_ctx(hw, (u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); - return ice_copy_rxq_ctx_to_hw(hw, ctx_buf, rxq_index); + return 0; } /* LAN Tx Queue Context */ -const struct ice_ctx_ele ice_tlan_ctx_info[] = { +static const struct packed_field_u8 ice_tlan_ctx_fields[] = { /* Field Width LSB */ ICE_CTX_STORE(ice_tlan_ctx, base, 57, 0), ICE_CTX_STORE(ice_tlan_ctx, port_num, 3, 57), @@ -1470,10 +1478,22 @@ const struct ice_ctx_ele ice_tlan_ctx_info[] = { ICE_CTX_STORE(ice_tlan_ctx, drop_ena, 1, 165), ICE_CTX_STORE(ice_tlan_ctx, cache_prof_idx, 2, 166), ICE_CTX_STORE(ice_tlan_ctx, pkt_shaper_prof_idx, 3, 168), - ICE_CTX_STORE(ice_tlan_ctx, int_q_state, 122, 171), - { 0 } }; +/** + * ice_pack_txq_ctx - Pack Tx queue context into a HW buffer + * @ctx: the Tx queue context to pack + * @buf: the HW buffer to pack into + * + * Pack the Tx queue context from the CPU-friendly unpacked buffer into its + * bit-packed HW layout. 
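+ * The destination is the 22-byte ice_txq_ctx_buf_t declared in
+ * ice_adminq_cmd.h and embedded in struct ice_aqc_add_txqs_perq.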
+ */ +void ice_pack_txq_ctx(const struct ice_tlan_ctx *ctx, ice_txq_ctx_buf_t *buf) +{ + pack_fields(buf, sizeof(*buf), ctx, ice_tlan_ctx_fields, + QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST); +} + /* Sideband Queue command wrappers */ /** @@ -4558,205 +4578,6 @@ ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps, /* End of FW Admin Queue command wrappers */ /** - * ice_pack_ctx_byte - write a byte to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_byte(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u8 src_byte, dest_byte, mask; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - src_byte = *from; - src_byte <<= shift_width; - src_byte &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_byte, dest, sizeof(dest_byte)); - - dest_byte &= ~mask; /* get the bits not changing */ - dest_byte |= src_byte; /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_byte, sizeof(dest_byte)); -} - -/** - * ice_pack_ctx_word - write a word to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_word(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u16 src_word, mask; - __le16 dest_word; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_word = *(u16 *)from; - src_word <<= shift_width; - src_word &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_word, dest, sizeof(dest_word)); - - dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ - dest_word |= cpu_to_le16(src_word); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_word, sizeof(dest_word)); -} - -/** - * ice_pack_ctx_dword - write a dword to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_dword(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u32 src_dword, mask; - __le32 dest_dword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_dword = *(u32 *)from; - src_dword <<= shift_width; - src_dword &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_dword, dest, sizeof(dest_dword)); - - dest_dword &= 
~(cpu_to_le32(mask)); /* get the bits not changing */ - dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_dword, sizeof(dest_dword)); -} - -/** - * ice_pack_ctx_qword - write a qword to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_qword(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u64 src_qword, mask; - __le64 dest_qword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK_ULL(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_qword = *(u64 *)from; - src_qword <<= shift_width; - src_qword &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_qword, dest, sizeof(dest_qword)); - - dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ - dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_qword, sizeof(dest_qword)); -} - -/** - * ice_set_ctx - set context bits in packed structure - * @hw: pointer to the hardware structure - * @src_ctx: pointer to a generic non-packed context structure - * @dest_ctx: pointer to memory for the packed structure - * @ce_info: List of Rx context elements - */ -int ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - int f; - - for (f = 0; ce_info[f].width; f++) { - /* We have to deal with each element of the FW response - * using the correct size so that we are correct regardless - * of the endianness of the machine. - */ - if (ce_info[f].width > (ce_info[f].size_of * BITS_PER_BYTE)) { - ice_debug(hw, ICE_DBG_QCTX, "Field %d width of %d bits larger than size of %d byte(s) ... 
skipping write\n", - f, ce_info[f].width, ce_info[f].size_of); - continue; - } - switch (ce_info[f].size_of) { - case sizeof(u8): - ice_pack_ctx_byte(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u16): - ice_pack_ctx_word(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u32): - ice_pack_ctx_dword(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u64): - ice_pack_ctx_qword(src_ctx, dest_ctx, &ce_info[f]); - break; - default: - return -EINVAL; - } - } - - return 0; -} - -/** * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC * @hw: pointer to the HW struct * @vsi_handle: software VSI handle diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 27208a60cece..a68bea3934e3 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -92,9 +92,8 @@ ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq); int ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); -extern const struct ice_ctx_ele ice_tlan_ctx_info[]; -int ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info); + +void ice_pack_txq_ctx(const struct ice_tlan_ctx *ctx, ice_txq_ctx_buf_t *buf); extern struct mutex ice_global_cfg_lock_sw; diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index ac7db100e2cd..5c7dcf21b222 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -5,7 +5,7 @@ #define _ICE_ESWITCH_H_ #include <net/devlink.h> -#include "devlink/devlink_port.h" +#include "devlink/port.h" #ifdef CONFIG_ICE_SWITCHDEV void ice_eswitch_detach_vf(struct ice_pf *pf, struct ice_vf *vf); diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 611577ebc29d..1479b45738af 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -371,29 +371,21 @@ enum ice_rx_flex_desc_status_error_1_bits { ICE_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! */ }; -#define ICE_RXQ_CTX_SIZE_DWORDS 8 -#define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) #define ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS 22 #define ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS 5 #define GLTCLAN_CQ_CNTX(i, CQ) (GLTCLAN_CQ_CNTX0(CQ) + ((i) * 0x0800)) -/* RLAN Rx queue context data - * - * The sizes of the variables may be larger than needed due to crossing byte - * boundaries. If we do not have the width of the variable set to the correct - * size then we could end up shifting bits off the top of the variable when the - * variable is at the top of a byte and crosses over into the next byte. 
- */ +/* RLAN Rx queue context data */ struct ice_rlan_ctx { u16 head; - u16 cpuid; /* bigger than needed, see above for reason */ + u8 cpuid; #define ICE_RLAN_BASE_S 7 u64 base; u16 qlen; #define ICE_RLAN_CTX_DBUF_S 7 - u16 dbuf; /* bigger than needed, see above for reason */ + u8 dbuf; #define ICE_RLAN_CTX_HBUF_S 6 - u16 hbuf; /* bigger than needed, see above for reason */ + u8 hbuf; u8 dtype; u8 dsize; u8 crcstrip; @@ -401,29 +393,15 @@ struct ice_rlan_ctx { u8 hsplit_0; u8 hsplit_1; u8 showiv; - u32 rxmax; /* bigger than needed, see above for reason */ + u16 rxmax; u8 tphrdesc_ena; u8 tphwdesc_ena; u8 tphdata_ena; u8 tphhead_ena; - u16 lrxqthresh; /* bigger than needed, see above for reason */ + u8 lrxqthresh; u8 prefena; /* NOTE: normally must be set to 1 at init */ }; -struct ice_ctx_ele { - u16 offset; - u16 size_of; - u16 width; - u16 lsb; -}; - -#define ICE_CTX_STORE(_struct, _ele, _width, _lsb) { \ - .offset = offsetof(struct _struct, _ele), \ - .size_of = sizeof_field(struct _struct, _ele), \ - .width = _width, \ - .lsb = _lsb, \ -} - /* for hsplit_0 field of Rx RLAN context */ enum ice_rlan_ctx_rx_hsplit_0 { ICE_RLAN_RX_HSPLIT_0_NO_SPLIT = 0, @@ -551,18 +529,12 @@ enum ice_tx_ctx_desc_eipt_offload { #define ICE_LAN_TXQ_MAX_QGRPS 127 #define ICE_LAN_TXQ_MAX_QDIS 1023 -/* Tx queue context data - * - * The sizes of the variables may be larger than needed due to crossing byte - * boundaries. If we do not have the width of the variable set to the correct - * size then we could end up shifting bits off the top of the variable when the - * variable is at the top of a byte and crosses over into the next byte. - */ +/* Tx queue context data */ struct ice_tlan_ctx { #define ICE_TLAN_CTX_BASE_S 7 u64 base; /* base is defined in 128-byte units */ u8 port_num; - u16 cgd_num; /* bigger than needed, see above for reason */ + u8 cgd_num; u8 pf_num; u16 vmvf_num; u8 vmvf_type; @@ -573,7 +545,7 @@ struct ice_tlan_ctx { u8 tsyn_ena; u8 internal_usage_flag; u8 alt_vlan; - u16 cpuid; /* bigger than needed, see above for reason */ + u8 cpuid; u8 wb_mode; u8 tphrd_desc; u8 tphrd; @@ -582,7 +554,7 @@ struct ice_tlan_ctx { u16 qnum_in_func; u8 itr_notification_mode; u8 adjust_prof_id; - u32 qlen; /* bigger than needed, see above for reason */ + u16 qlen; u8 quanta_prof_idx; u8 tso_ena; u16 tso_qnum; @@ -590,7 +562,6 @@ struct ice_tlan_ctx { u8 drop_ena; u8 cache_prof_idx; u8 pkt_shaper_prof_idx; - u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! 
*/ }; #endif /* _ICE_LAN_TX_RX_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 0ab35607e5d5..1701f7143f24 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -14,7 +14,7 @@ #include "ice_dcb_lib.h" #include "ice_dcb_nl.h" #include "devlink/devlink.h" -#include "devlink/devlink_port.h" +#include "devlink/port.h" #include "ice_sf_eth.h" #include "ice_hwmon.h" /* Including ice_trace.h with CREATE_TRACE_POINTS defined will generate the @@ -1816,6 +1816,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_tx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_TX_PQM, pf_num, vf_num, + event, queue); wr32(hw, GL_MDET_TX_PQM, 0xffffffff); } @@ -1829,6 +1831,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_tx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_TX_TCLAN, pf_num, vf_num, + event, queue); wr32(hw, GL_MDET_TX_TCLAN_BY_MAC(hw), U32_MAX); } @@ -1842,6 +1846,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_rx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_RX, pf_num, vf_num, event, + queue); wr32(hw, GL_MDET_RX, 0xffffffff); } @@ -2364,9 +2370,11 @@ static void ice_service_task(struct work_struct *work) struct ice_pf *pf = container_of(work, struct ice_pf, serv_task); unsigned long start_time = jiffies; - /* subtasks */ + if (pf->health_reporters.tx_hang_buf.tx_ring) { + ice_report_tx_hang(pf); + pf->health_reporters.tx_hang_buf.tx_ring = NULL; + } - /* process reset requests first */ ice_reset_subtask(pf); /* bail if a reset/recovery cycle is pending or rebuild failed */ @@ -5087,6 +5095,7 @@ static int ice_init_devlink(struct ice_pf *pf) return err; ice_devlink_init_regions(pf); + ice_health_init(pf); ice_devlink_register(pf); return 0; @@ -5095,6 +5104,7 @@ static int ice_init_devlink(struct ice_pf *pf) static void ice_deinit_devlink(struct ice_pf *pf) { ice_devlink_unregister(pf); + ice_health_deinit(pf); ice_devlink_destroy_regions(pf); ice_devlink_unregister_params(pf); } @@ -7793,6 +7803,8 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) /* if we get here, reset flow is successful */ clear_bit(ICE_RESET_FAILED, pf->state); + ice_health_clear(pf); + ice_plug_aux_dev(pf); if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) ice_lag_rebuild(pf); @@ -8283,16 +8295,18 @@ void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) if (tx_ring) { struct ice_hw *hw = &pf->hw; - u32 head, val = 0; + u32 head, intr = 0; head = FIELD_GET(QTX_COMM_HEAD_HEAD_M, rd32(hw, QTX_COMM_HEAD(vsi->txq_map[txqueue]))); /* Read interrupt register */ - val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); + intr = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %u, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", vsi->vsi_num, txqueue, tx_ring->next_to_clean, - head, tx_ring->next_to_use, val); + head, tx_ring->next_to_use, intr); + + ice_prep_tx_hang_report(pf, tx_ring, vsi->vsi_num, head, intr); } pf->tx_timeout_last_recovery = jiffies; diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c 
b/drivers/net/ethernet/intel/ice/ice_repr.c index 970a99a52bf1..fb7a1b9a4313 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -4,7 +4,7 @@ #include "ice.h" #include "ice_eswitch.h" #include "devlink/devlink.h" -#include "devlink/devlink_port.h" +#include "devlink/port.h" #include "ice_sriov.h" #include "ice_tc_lib.h" #include "ice_dcb_lib.h" diff --git a/drivers/net/ethernet/intel/ice/ice_sf_eth.c b/drivers/net/ethernet/intel/ice/ice_sf_eth.c index 75d7147e1c01..1a2c94375ca7 100644 --- a/drivers/net/ethernet/intel/ice/ice_sf_eth.c +++ b/drivers/net/ethernet/intel/ice/ice_sf_eth.c @@ -5,8 +5,8 @@ #include "ice_txrx.h" #include "ice_fltr.h" #include "ice_sf_eth.h" -#include "devlink/devlink_port.h" #include "devlink/devlink.h" +#include "devlink/port.h" static const struct net_device_ops ice_sf_netdev_ops = { .ndo_open = ice_open, diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 660dff5426e7..83ce3bfefa5c 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -90,7 +90,6 @@ struct ltq_etop_priv { struct net_device *netdev; struct platform_device *pdev; struct ltq_eth_data *pldata; - struct resource *res; struct mii_bus *mii_bus; @@ -643,31 +642,14 @@ ltq_etop_probe(struct platform_device *pdev) { struct net_device *dev; struct ltq_etop_priv *priv; - struct resource *res; int err; int i; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "failed to get etop resource\n"); - err = -ENOENT; - goto err_out; - } - - res = devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), dev_name(&pdev->dev)); - if (!res) { - dev_err(&pdev->dev, "failed to request etop resource\n"); - err = -EBUSY; - goto err_out; - } - - ltq_etop_membase = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!ltq_etop_membase) { + ltq_etop_membase = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(ltq_etop_membase)) { dev_err(&pdev->dev, "failed to remap etop engine %d\n", pdev->id); - err = -ENOMEM; + err = PTR_ERR(ltq_etop_membase); goto err_out; } @@ -679,7 +661,6 @@ ltq_etop_probe(struct platform_device *pdev) dev->netdev_ops = <q_eth_netdev_ops; dev->ethtool_ops = <q_etop_ethtool_ops; priv = netdev_priv(dev); - priv->res = res; priv->pdev = pdev; priv->pldata = dev_get_platdata(&pdev->dev); priv->netdev = dev; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 1fb285fa0bdb..fe6261b81540 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3960,20 +3960,27 @@ static struct mvneta_port *mvneta_pcs_to_port(struct phylink_pcs *pcs) return container_of(pcs, struct mvneta_port, phylink_pcs); } -static int mvneta_pcs_validate(struct phylink_pcs *pcs, - unsigned long *supported, - const struct phylink_link_state *state) +static unsigned int mvneta_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) { - /* We only support QSGMII, SGMII, 802.3z and RGMII modes. - * When in 802.3z mode, we must have AN enabled: + /* When operating in an 802.3z mode, we must have AN enabled: * "Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ... * When <PortType> = 1 (1000BASE-X) this field must be set to 1." + * Therefore, inband is "required". 
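+	 *
+	 * The new return convention is a mask of LINK_INBAND_* bits:
+	 * ENABLE alone means in-band AN is required, DISABLE | ENABLE
+	 * means it is optional, DISABLE alone means it is unavailable.
+	 * A hypothetical caller check:
+	 *
+	 *	caps = pcs_inband_caps(pcs, PHY_INTERFACE_MODE_1000BASEX);
+	 *	if (caps == LINK_INBAND_ENABLE)
+	 *		(in-band AN must be enabled for this mode)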
*/ - if (phy_interface_mode_is_8023z(state->interface) && - !phylink_test(state->advertising, Autoneg)) - return -EINVAL; + if (phy_interface_mode_is_8023z(interface)) + return LINK_INBAND_ENABLE; - return 0; + /* QSGMII, SGMII and RGMII can be configured to use inband + * signalling of the AN result. Indicate these as "possible". + */ + if (interface == PHY_INTERFACE_MODE_SGMII || + interface == PHY_INTERFACE_MODE_QSGMII || + phy_interface_mode_is_rgmii(interface)) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + /* For any other modes, indicate that inband is not supported. */ + return LINK_INBAND_DISABLE; } static void mvneta_pcs_get_state(struct phylink_pcs *pcs, @@ -4071,7 +4078,7 @@ static void mvneta_pcs_an_restart(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops mvneta_phylink_pcs_ops = { - .pcs_validate = mvneta_pcs_validate, + .pcs_inband_caps = mvneta_pcs_inband_caps, .pcs_get_state = mvneta_pcs_get_state, .pcs_config = mvneta_pcs_config, .pcs_an_restart = mvneta_pcs_an_restart, diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 571631a30320..f85229a30844 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6224,19 +6224,26 @@ static const struct phylink_pcs_ops mvpp2_phylink_xlg_pcs_ops = { .pcs_config = mvpp2_xlg_pcs_config, }; -static int mvpp2_gmac_pcs_validate(struct phylink_pcs *pcs, - unsigned long *supported, - const struct phylink_link_state *state) +static unsigned int mvpp2_gmac_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) { - /* When in 802.3z mode, we must have AN enabled: + /* When operating in an 802.3z mode, we must have AN enabled: * Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ... * When <PortType> = 1 (1000BASE-X) this field must be set to 1. + * Therefore, inband is "required". */ - if (phy_interface_mode_is_8023z(state->interface) && - !phylink_test(state->advertising, Autoneg)) - return -EINVAL; + if (phy_interface_mode_is_8023z(interface)) + return LINK_INBAND_ENABLE; - return 0; + /* SGMII and RGMII can be configured to use inband signalling of the + * AN result. Indicate these as "possible". + */ + if (interface == PHY_INTERFACE_MODE_SGMII || + phy_interface_mode_is_rgmii(interface)) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + /* For any other modes, indicate that inband is not supported. 
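+	 * (Same tri-state LINK_INBAND_* convention as
+	 * mvneta_pcs_inband_caps() above: returning DISABLE alone tells
+	 * phylink the link must be configured out-of-band.)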
*/ + return LINK_INBAND_DISABLE; } static void mvpp2_gmac_pcs_get_state(struct phylink_pcs *pcs, @@ -6343,7 +6350,7 @@ static void mvpp2_gmac_pcs_an_restart(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops mvpp2_phylink_gmac_pcs_ops = { - .pcs_validate = mvpp2_gmac_pcs_validate, + .pcs_inband_caps = mvpp2_gmac_pcs_inband_caps, .pcs_get_state = mvpp2_gmac_pcs_get_state, .pcs_config = mvpp2_gmac_pcs_config, .pcs_an_restart = mvpp2_gmac_pcs_an_restart, diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 549436efc204..3a9825883d79 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1137,6 +1137,43 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static int octep_get_vf_config(struct net_device *dev, int vf, + struct ifla_vf_info *ivi) +{ + struct octep_device *oct = netdev_priv(dev); + + ivi->vf = vf; + ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); + ivi->spoofchk = true; + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + ivi->trusted = false; + + return 0; +} + +static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) +{ + struct octep_device *oct = netdev_priv(dev); + int err; + + if (!is_valid_ether_addr(mac)) { + dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); + return -EADDRNOTAVAIL; + } + + dev_dbg(&oct->pdev->dev, "set vf-%d mac to %pM\n", vf, mac); + ether_addr_copy(oct->vf_info[vf].mac_addr, mac); + oct->vf_info[vf].flags |= OCTEON_PFVF_FLAG_MAC_SET_BY_PF; + + err = octep_ctrl_net_set_mac_addr(oct, vf, mac, true); + if (err) + dev_err(&oct->pdev->dev, + "Set VF%d MAC address failed via host control Mbox\n", + vf); + + return err; +} + static const struct net_device_ops octep_netdev_ops = { .ndo_open = octep_open, .ndo_stop = octep_stop, @@ -1146,6 +1183,8 @@ static const struct net_device_ops octep_netdev_ops = { .ndo_set_mac_address = octep_set_mac, .ndo_change_mtu = octep_change_mtu, .ndo_set_features = octep_set_features, + .ndo_get_vf_config = octep_get_vf_config, + .ndo_set_vf_mac = octep_set_vf_mac }; /** diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h index fee59e0e0138..3b56916af468 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h @@ -220,6 +220,7 @@ struct octep_iface_link_info { /* The Octeon VF device specific info data structure.*/ struct octep_pfvf_info { u8 mac_addr[ETH_ALEN]; + u32 flags; u32 mbox_version; }; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c index e6eb98d70f3c..ebecdd29f3bd 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c @@ -156,12 +156,23 @@ static void octep_pfvf_set_mac_addr(struct octep_device *oct, u32 vf_id, { int err; + if (oct->vf_info[vf_id].flags & OCTEON_PFVF_FLAG_MAC_SET_BY_PF) { + dev_err(&oct->pdev->dev, + "VF%d attempted to override administrative set MAC address\n", + vf_id); + rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; + return; + } + err = octep_ctrl_net_set_mac_addr(oct, vf_id, cmd.s_set_mac.mac_addr, true); if (err) { rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; - dev_err(&oct->pdev->dev, "Set VF MAC address failed via host control Mbox\n"); + dev_err(&oct->pdev->dev, "Set 
VF%d MAC address failed via host control Mbox\n", + vf_id); return; } + + ether_addr_copy(oct->vf_info[vf_id].mac_addr, cmd.s_set_mac.mac_addr); rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; } @@ -171,10 +182,18 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id, { int err; + if (oct->vf_info[vf_id].flags & OCTEON_PFVF_FLAG_MAC_SET_BY_PF) { + dev_dbg(&oct->pdev->dev, "VF%d MAC address set by PF\n", vf_id); + ether_addr_copy(rsp->s_set_mac.mac_addr, + oct->vf_info[vf_id].mac_addr); + rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; + return; + } err = octep_ctrl_net_get_mac_addr(oct, vf_id, rsp->s_set_mac.mac_addr); if (err) { rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; - dev_err(&oct->pdev->dev, "Get VF MAC address failed via host control Mbox\n"); + dev_err(&oct->pdev->dev, "Get VF%d MAC address failed via host control Mbox\n", + vf_id); return; } rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h index 0dc6eead292a..386a095a99bc 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h @@ -8,8 +8,6 @@ #ifndef _OCTEP_PFVF_MBOX_H_ #define _OCTEP_PFVF_MBOX_H_ -/* VF flags */ -#define OCTEON_PFVF_FLAG_MAC_SET_BY_PF BIT_ULL(0) /* PF has set VF MAC address */ #define OCTEON_SDP_16K_HW_FRS 16380UL #define OCTEON_SDP_64K_HW_FRS 65531UL @@ -23,6 +21,10 @@ enum octep_pfvf_mbox_version { #define OCTEP_PFVF_MBOX_VERSION_CURRENT OCTEP_PFVF_MBOX_VERSION_V2 +/* VF flags */ +/* PF has set VF MAC address */ +#define OCTEON_PFVF_FLAG_MAC_SET_BY_PF BIT(0) + enum octep_pfvf_mbox_opcode { OCTEP_PFVF_MBOX_CMD_VERSION, OCTEP_PFVF_MBOX_CMD_SET_MTU, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 62c07407eb94..005ca8a056c0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -313,6 +313,10 @@ M(NIX_BANDPROF_FREE, 0x801e, nix_bandprof_free, nix_bandprof_free_req, \ msg_rsp) \ M(NIX_BANDPROF_GET_HWINFO, 0x801f, nix_bandprof_get_hwinfo, msg_req, \ nix_bandprof_get_hwinfo_rsp) \ +M(NIX_CPT_BP_ENABLE, 0x8020, nix_cpt_bp_enable, nix_bp_cfg_req, \ + nix_bp_cfg_rsp) \ +M(NIX_CPT_BP_DISABLE, 0x8021, nix_cpt_bp_disable, nix_bp_cfg_req, \ + msg_rsp) \ M(NIX_READ_INLINE_IPSEC_CFG, 0x8023, nix_read_inline_ipsec_cfg, \ msg_req, nix_inline_ipsec_cfg) \ M(NIX_MCAST_GRP_CREATE, 0x802b, nix_mcast_grp_create, nix_mcast_grp_create_req, \ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index a5d1e2bddd58..613655fcd34f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -569,9 +569,17 @@ void rvu_nix_flr_free_bpids(struct rvu *rvu, u16 pcifunc) mutex_unlock(&rvu->rsrc_lock); } -int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, - struct nix_bp_cfg_req *req, - struct msg_rsp *rsp) +static u16 nix_get_channel(u16 chan, bool cpt_link) +{ + /* CPT channel for a given link channel is always + * assumed to be BIT(11) set in link channel. + */ + return cpt_link ? 
chan | BIT(11) : chan; +} + +static int nix_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp, bool cpt_link) { u16 pcifunc = req->hdr.pcifunc; int blkaddr, pf, type, err; @@ -579,6 +587,7 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, struct rvu_pfvf *pfvf; struct nix_hw *nix_hw; struct nix_bp *bp; + u16 chan_v; u64 cfg; pf = rvu_get_pf(pcifunc); @@ -589,6 +598,9 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, if (is_sdp_pfvf(pcifunc)) type = NIX_INTF_TYPE_SDP; + if (cpt_link && !rvu->hw->cpt_links) + return 0; + pfvf = rvu_get_pfvf(rvu, pcifunc); err = nix_get_struct_ptrs(rvu, pcifunc, &nix_hw, &blkaddr); if (err) @@ -597,8 +609,9 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, bp = &nix_hw->bp; chan_base = pfvf->rx_chan_base + req->chan_base; for (chan = chan_base; chan < (chan_base + req->chan_cnt); chan++) { - cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan)); - rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan), + chan_v = nix_get_channel(chan, cpt_link); + cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v)); + rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v), cfg & ~BIT_ULL(16)); if (type == NIX_INTF_TYPE_LBK) { @@ -617,6 +630,20 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, return 0; } +int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp) +{ + return nix_bp_disable(rvu, req, rsp, false); +} + +int rvu_mbox_handler_nix_cpt_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp) +{ + return nix_bp_disable(rvu, req, rsp, true); +} + static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, int type, int chan_id) { @@ -696,15 +723,17 @@ static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, return bpid; } -int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, - struct nix_bp_cfg_req *req, - struct nix_bp_cfg_rsp *rsp) +static int nix_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp, + bool cpt_link) { int blkaddr, pf, type, chan_id = 0; u16 pcifunc = req->hdr.pcifunc; struct rvu_pfvf *pfvf; u16 chan_base, chan; s16 bpid, bpid_base; + u16 chan_v; u64 cfg; pf = rvu_get_pf(pcifunc); @@ -717,6 +746,9 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, type != NIX_INTF_TYPE_SDP) return 0; + if (cpt_link && !rvu->hw->cpt_links) + return 0; + pfvf = rvu_get_pfvf(rvu, pcifunc); blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); @@ -730,9 +762,11 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, return -EINVAL; } - cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan)); + chan_v = nix_get_channel(chan, cpt_link); + + cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v)); cfg &= ~GENMASK_ULL(8, 0); - rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan), + rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v), cfg | (bpid & GENMASK_ULL(8, 0)) | BIT_ULL(16)); chan_id++; bpid = rvu_nix_get_bpid(rvu, req, type, chan_id); @@ -750,6 +784,20 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, return 0; } +int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp) +{ + return nix_bp_enable(rvu, req, rsp, false); +} + +int rvu_mbox_handler_nix_cpt_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp) +{ + return nix_bp_enable(rvu, req, rsp, true); +} + static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr, u64 format, bool v4, u64 *fidx) { diff --git 
a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile index dbc971266865..cb6513ab35e7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile @@ -15,5 +15,6 @@ rvu_rep-y := rep.o rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o rvu_nicpf-$(CONFIG_MACSEC) += cn10k_macsec.o +rvu_nicpf-$(CONFIG_XFRM_OFFLOAD) += cn10k_ipsec.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c new file mode 100644 index 000000000000..09a5b5268205 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -0,0 +1,1056 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell IPSEC offload driver + * + * Copyright (C) 2024 Marvell. + */ + +#include <net/xfrm.h> +#include <linux/netdevice.h> +#include <linux/bitfield.h> +#include <crypto/aead.h> +#include <crypto/gcm.h> + +#include "otx2_common.h" +#include "otx2_struct.h" +#include "cn10k_ipsec.h" + +static bool is_dev_support_ipsec_offload(struct pci_dev *pdev) +{ + return is_dev_cn10ka_b0(pdev) || is_dev_cn10kb(pdev); +} + +static bool cn10k_cpt_device_set_inuse(struct otx2_nic *pf) +{ + enum cn10k_cpt_hw_state_e state; + + while (true) { + state = atomic_cmpxchg(&pf->ipsec.cpt_state, + CN10K_CPT_HW_AVAILABLE, + CN10K_CPT_HW_IN_USE); + if (state == CN10K_CPT_HW_AVAILABLE) + return true; + if (state == CN10K_CPT_HW_UNAVAILABLE) + return false; + + mdelay(1); + } +} + +static void cn10k_cpt_device_set_available(struct otx2_nic *pf) +{ + atomic_set(&pf->ipsec.cpt_state, CN10K_CPT_HW_AVAILABLE); +} + +static void cn10k_cpt_device_set_unavailable(struct otx2_nic *pf) +{ + atomic_set(&pf->ipsec.cpt_state, CN10K_CPT_HW_UNAVAILABLE); +} + +static int cn10k_outb_cptlf_attach(struct otx2_nic *pf) +{ + struct rsrc_attach *attach; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + /* Get memory to put this msg */ + attach = otx2_mbox_alloc_msg_attach_resources(&pf->mbox); + if (!attach) + goto unlock; + + attach->cptlfs = true; + attach->modify = true; + + /* Send attach request to AF */ + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static int cn10k_outb_cptlf_detach(struct otx2_nic *pf) +{ + struct rsrc_detach *detach; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + detach = otx2_mbox_alloc_msg_detach_resources(&pf->mbox); + if (!detach) + goto unlock; + + detach->partial = true; + detach->cptlfs = true; + + /* Send detach request to AF */ + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static int cn10k_outb_cptlf_alloc(struct otx2_nic *pf) +{ + struct cpt_lf_alloc_req_msg *req; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + req = otx2_mbox_alloc_msg_cpt_lf_alloc(&pf->mbox); + if (!req) + goto unlock; + + /* PF function */ + req->nix_pf_func = pf->pcifunc; + /* Enable SE-IE Engine Group */ + req->eng_grpmsk = 1 << CN10K_DEF_CPT_IPSEC_EGRP; + + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static void cn10k_outb_cptlf_free(struct otx2_nic *pf) +{ + mutex_lock(&pf->mbox.lock); + otx2_mbox_alloc_msg_cpt_lf_free(&pf->mbox); + otx2_sync_mbox_msg(&pf->mbox); + mutex_unlock(&pf->mbox.lock); +} + +static int cn10k_outb_cptlf_config(struct otx2_nic *pf) +{ + struct cpt_inline_ipsec_cfg_msg *req; + int ret = -ENOMEM; + + 
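+	/* Same mbox pattern as the attach/alloc helpers above: take the
+	 * mbox lock, allocate a request (a NULL return falls through
+	 * with the preset -ENOMEM), fill it in, then synchronously wait
+	 * for the AF response.
+	 */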
mutex_lock(&pf->mbox.lock); + req = otx2_mbox_alloc_msg_cpt_inline_ipsec_cfg(&pf->mbox); + if (!req) + goto unlock; + + req->dir = CPT_INLINE_OUTBOUND; + req->enable = 1; + req->nix_pf_func = pf->pcifunc; + ret = otx2_sync_mbox_msg(&pf->mbox); +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static void cn10k_outb_cptlf_iq_enable(struct otx2_nic *pf) +{ + u64 reg_val; + + /* Set Execution Enable of instruction queue */ + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + reg_val |= BIT_ULL(16); + otx2_write64(pf, CN10K_CPT_LF_INPROG, reg_val); + + /* Set iqueue's enqueuing */ + reg_val = otx2_read64(pf, CN10K_CPT_LF_CTL); + reg_val |= BIT_ULL(0); + otx2_write64(pf, CN10K_CPT_LF_CTL, reg_val); +} + +static void cn10k_outb_cptlf_iq_disable(struct otx2_nic *pf) +{ + u32 inflight, grb_cnt, gwb_cnt; + u32 nq_ptr, dq_ptr; + int timeout = 20; + u64 reg_val; + int cnt; + + /* Disable instructions enqueuing */ + otx2_write64(pf, CN10K_CPT_LF_CTL, 0ull); + + /* Wait for instruction queue to become empty. + * CPT_LF_INPROG.INFLIGHT count is zero + */ + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + inflight = FIELD_GET(CPT_LF_INPROG_INFLIGHT, reg_val); + if (!inflight) + break; + + usleep_range(10000, 20000); + if (timeout-- < 0) { + netdev_err(pf->netdev, "Timeout to cleanup CPT IQ\n"); + break; + } + } while (1); + + /* Disable executions in the LF's queue, + * the queue should be empty at this point + */ + reg_val &= ~BIT_ULL(16); + otx2_write64(pf, CN10K_CPT_LF_INPROG, reg_val); + + /* Wait for instruction queue to become empty */ + cnt = 0; + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + if (reg_val & BIT_ULL(31)) + cnt = 0; + else + cnt++; + reg_val = otx2_read64(pf, CN10K_CPT_LF_Q_GRP_PTR); + nq_ptr = FIELD_GET(CPT_LF_Q_GRP_PTR_DQ_PTR, reg_val); + dq_ptr = FIELD_GET(CPT_LF_Q_GRP_PTR_DQ_PTR, reg_val); + } while ((cnt < 10) && (nq_ptr != dq_ptr)); + + cnt = 0; + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + inflight = FIELD_GET(CPT_LF_INPROG_INFLIGHT, reg_val); + grb_cnt = FIELD_GET(CPT_LF_INPROG_GRB_CNT, reg_val); + gwb_cnt = FIELD_GET(CPT_LF_INPROG_GWB_CNT, reg_val); + if (inflight == 0 && gwb_cnt < 40 && + (grb_cnt == 0 || grb_cnt == 40)) + cnt++; + else + cnt = 0; + } while (cnt < 10); +} + +/* Allocate memory for CPT outbound Instruction queue. 
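+ * A single DMA-coherent region is carved into the three parts shown
+ * below; iq->vaddr and iq->dma_addr are aligned up after allocation,
+ * which is why OTX2_ALIGN bytes of slack are added to iq->size.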
+ * Instruction queue memory format is: + * ----------------------------- + * | Instruction Group memory | + * | (CPT_LF_Q_SIZE[SIZE_DIV40] | + * | x 16 Bytes) | + * | | + * ----------------------------- <-- CPT_LF_Q_BASE[ADDR] + * | Flow Control (128 Bytes) | + * | | + * ----------------------------- + * | Instruction Memory | + * | (CPT_LF_Q_SIZE[SIZE_DIV40] | + * | × 40 × 64 bytes) | + * | | + * ----------------------------- + */ +static int cn10k_outb_cptlf_iq_alloc(struct otx2_nic *pf) +{ + struct cn10k_cpt_inst_queue *iq = &pf->ipsec.iq; + + iq->size = CN10K_CPT_INST_QLEN_BYTES + CN10K_CPT_Q_FC_LEN + + CN10K_CPT_INST_GRP_QLEN_BYTES + OTX2_ALIGN; + + iq->real_vaddr = dma_alloc_coherent(pf->dev, iq->size, + &iq->real_dma_addr, GFP_KERNEL); + if (!iq->real_vaddr) + return -ENOMEM; + + /* iq->vaddr/dma_addr points to Flow Control location */ + iq->vaddr = iq->real_vaddr + CN10K_CPT_INST_GRP_QLEN_BYTES; + iq->dma_addr = iq->real_dma_addr + CN10K_CPT_INST_GRP_QLEN_BYTES; + + /* Align pointers */ + iq->vaddr = PTR_ALIGN(iq->vaddr, OTX2_ALIGN); + iq->dma_addr = PTR_ALIGN(iq->dma_addr, OTX2_ALIGN); + return 0; +} + +static void cn10k_outb_cptlf_iq_free(struct otx2_nic *pf) +{ + struct cn10k_cpt_inst_queue *iq = &pf->ipsec.iq; + + if (iq->real_vaddr) + dma_free_coherent(pf->dev, iq->size, iq->real_vaddr, + iq->real_dma_addr); + + iq->real_vaddr = NULL; + iq->vaddr = NULL; +} + +static int cn10k_outb_cptlf_iq_init(struct otx2_nic *pf) +{ + u64 reg_val; + int ret; + + /* Allocate Memory for CPT IQ */ + ret = cn10k_outb_cptlf_iq_alloc(pf); + if (ret) + return ret; + + /* Disable IQ */ + cn10k_outb_cptlf_iq_disable(pf); + + /* Set IQ base address */ + otx2_write64(pf, CN10K_CPT_LF_Q_BASE, pf->ipsec.iq.dma_addr); + + /* Set IQ size */ + reg_val = FIELD_PREP(CPT_LF_Q_SIZE_DIV40, CN10K_CPT_SIZE_DIV40 + + CN10K_CPT_EXTRA_SIZE_DIV40); + otx2_write64(pf, CN10K_CPT_LF_Q_SIZE, reg_val); + + return 0; +} + +static int cn10k_outb_cptlf_init(struct otx2_nic *pf) +{ + int ret; + + /* Initialize CPTLF Instruction Queue (IQ) */ + ret = cn10k_outb_cptlf_iq_init(pf); + if (ret) + return ret; + + /* Configure CPTLF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_config(pf); + if (ret) + goto iq_clean; + + /* Enable CPTLF IQ */ + cn10k_outb_cptlf_iq_enable(pf); + return 0; +iq_clean: + cn10k_outb_cptlf_iq_free(pf); + return ret; +} + +static int cn10k_outb_cpt_init(struct net_device *netdev) +{ + struct otx2_nic *pf = netdev_priv(netdev); + int ret; + + /* Attach a CPT LF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_attach(pf); + if (ret) + return ret; + + /* Allocate a CPT LF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_alloc(pf); + if (ret) + goto detach; + + /* Initialize the CPTLF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_init(pf); + if (ret) + goto lf_free; + + pf->ipsec.io_addr = (__force u64)otx2_get_regaddr(pf, + CN10K_CPT_LF_NQX(0)); + + /* Set ipsec offload enabled for this device */ + pf->flags |= OTX2_FLAG_IPSEC_OFFLOAD_ENABLED; + + cn10k_cpt_device_set_available(pf); + return 0; + +lf_free: + cn10k_outb_cptlf_free(pf); +detach: + cn10k_outb_cptlf_detach(pf); + return ret; +} + +static int cn10k_outb_cpt_clean(struct otx2_nic *pf) +{ + int ret; + + if (!cn10k_cpt_device_set_inuse(pf)) { + netdev_err(pf->netdev, "CPT LF device unavailable\n"); + return -ENODEV; + } + + /* Set ipsec offload disabled for this device */ + pf->flags &= ~OTX2_FLAG_IPSEC_OFFLOAD_ENABLED; + + /* Disable CPTLF Instruction Queue (IQ) */ + cn10k_outb_cptlf_iq_disable(pf); + + /* Set IQ base 
address and size to 0 */ + otx2_write64(pf, CN10K_CPT_LF_Q_BASE, 0); + otx2_write64(pf, CN10K_CPT_LF_Q_SIZE, 0); + + /* Free CPTLF IQ */ + cn10k_outb_cptlf_iq_free(pf); + + /* Free and detach CPT LF */ + cn10k_outb_cptlf_free(pf); + ret = cn10k_outb_cptlf_detach(pf); + if (ret) + netdev_err(pf->netdev, "Failed to detach CPT LF\n"); + + cn10k_cpt_device_set_unavailable(pf); + return ret; +} + +static void cn10k_cpt_inst_flush(struct otx2_nic *pf, struct cpt_inst_s *inst, + u64 size) +{ + struct otx2_lmt_info *lmt_info; + u64 val = 0, tar_addr = 0; + + lmt_info = per_cpu_ptr(pf->hw.lmt_info, smp_processor_id()); + /* FIXME: val[0:10] LMT_ID. + * [12:15] no of LMTST - 1 in the burst. + * [19:63] data size of each LMTST in the burst except first. + */ + val = (lmt_info->lmt_id & 0x7FF); + /* Target address for LMTST flush tells HW how many 128bit + * words are present. + * tar_addr[6:4] size of first LMTST - 1 in units of 128b. + */ + tar_addr |= pf->ipsec.io_addr | (((size / 16) - 1) & 0x7) << 4; + dma_wmb(); + memcpy((u64 *)lmt_info->lmt_addr, inst, size); + cn10k_lmt_flush(val, tar_addr); +} + +static int cn10k_wait_for_cpt_respose(struct otx2_nic *pf, + struct cpt_res_s *res) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(100); + u64 *completion_ptr = (u64 *)res; + + do { + if (time_after(jiffies, timeout)) { + netdev_err(pf->netdev, "CPT response timeout\n"); + return -EBUSY; + } + } while ((READ_ONCE(*completion_ptr) & CN10K_CPT_COMP_E_MASK) == + CN10K_CPT_COMP_E_NOTDONE); + + if (!(res->compcode == CN10K_CPT_COMP_E_GOOD || + res->compcode == CN10K_CPT_COMP_E_WARN) || res->uc_compcode) { + netdev_err(pf->netdev, "compcode=%x doneint=%x\n", + res->compcode, res->doneint); + netdev_err(pf->netdev, "uc_compcode=%x uc_info=%llx esn=%llx\n", + res->uc_compcode, (u64)res->uc_info, res->esn); + } + return 0; +} + +static int cn10k_outb_write_sa(struct otx2_nic *pf, struct qmem *sa_info) +{ + dma_addr_t res_iova, dptr_iova, sa_iova; + struct cn10k_tx_sa_s *sa_dptr; + struct cpt_inst_s inst = {}; + struct cpt_res_s *res; + u32 sa_size, off; + u64 *sptr, *dptr; + u64 reg_val; + int ret; + + sa_iova = sa_info->iova; + if (!sa_iova) + return -EINVAL; + + res = dma_alloc_coherent(pf->dev, sizeof(struct cpt_res_s), + &res_iova, GFP_ATOMIC); + if (!res) + return -ENOMEM; + + sa_size = sizeof(struct cn10k_tx_sa_s); + sa_dptr = dma_alloc_coherent(pf->dev, sa_size, &dptr_iova, GFP_ATOMIC); + if (!sa_dptr) { + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, + res_iova); + return -ENOMEM; + } + + sptr = (__force u64 *)sa_info->base; + dptr = (__force u64 *)sa_dptr; + for (off = 0; off < (sa_size / 8); off++) + *(dptr + off) = (__force u64)cpu_to_be64(*(sptr + off)); + + res->compcode = CN10K_CPT_COMP_E_NOTDONE; + inst.res_addr = res_iova; + inst.dptr = (u64)dptr_iova; + inst.param2 = sa_size >> 3; + inst.dlen = sa_size; + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_WRITE_SA; + inst.opcode_minor = CN10K_IPSEC_MINOR_OP_WRITE_SA; + inst.cptr = sa_iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* Check if CPT-LF available */ + if (!cn10k_cpt_device_set_inuse(pf)) { + ret = -ENODEV; + goto free_mem; + } + + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + dma_wmb(); + ret = cn10k_wait_for_cpt_respose(pf, res); + if (ret) + goto set_available; + + /* Trigger CTX flush to write dirty data back to DRAM */ + reg_val = FIELD_PREP(CPT_LF_CTX_FLUSH, sa_iova >> 7); + otx2_write64(pf, CN10K_CPT_LF_CTX_FLUSH, reg_val); + +set_available: + 
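+	/* mark the CPT LF available again before freeing the DMA
+	 * staging buffers
+	 */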
cn10k_cpt_device_set_available(pf); +free_mem: + dma_free_coherent(pf->dev, sa_size, sa_dptr, dptr_iova); + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, res_iova); + return ret; +} + +static int cn10k_ipsec_get_hw_ctx_offset(void) +{ + /* Offset on Hardware-context offset in word */ + return (offsetof(struct cn10k_tx_sa_s, hw_ctx) / sizeof(u64)) & 0x7F; +} + +static int cn10k_ipsec_get_ctx_push_size(void) +{ + /* Context push size is round up and in multiple of 8 Byte */ + return (roundup(offsetof(struct cn10k_tx_sa_s, hw_ctx), 8) / 8) & 0x7F; +} + +static int cn10k_ipsec_get_aes_key_len(int key_len) +{ + /* key_len is aes key length in bytes */ + switch (key_len) { + case 16: + return CN10K_IPSEC_SA_AES_KEY_LEN_128; + case 24: + return CN10K_IPSEC_SA_AES_KEY_LEN_192; + default: + return CN10K_IPSEC_SA_AES_KEY_LEN_256; + } +} + +static void cn10k_outb_prepare_sa(struct xfrm_state *x, + struct cn10k_tx_sa_s *sa_entry) +{ + int key_len = (x->aead->alg_key_len + 7) / 8; + struct net_device *netdev = x->xso.dev; + u8 *key = x->aead->alg_key; + struct otx2_nic *pf; + u32 *tmp_salt; + u64 *tmp_key; + int idx; + + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + + /* context size, 128 Byte aligned up */ + pf = netdev_priv(netdev); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->hw_ctx_off = cn10k_ipsec_get_hw_ctx_offset(); + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + + /* Ucode to skip two words of CPT_CTX_HW_S */ + sa_entry->ctx_hdr_size = 1; + + /* Allow Atomic operation (AOP) */ + sa_entry->aop_valid = 1; + + /* Outbound, ESP TRANSPORT/TUNNEL Mode, AES-GCM with */ + sa_entry->sa_dir = CN10K_IPSEC_SA_DIR_OUTB; + sa_entry->ipsec_protocol = CN10K_IPSEC_SA_IPSEC_PROTO_ESP; + sa_entry->enc_type = CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM; + sa_entry->iv_src = CN10K_IPSEC_SA_IV_SRC_PACKET; + if (x->props.mode == XFRM_MODE_TUNNEL) + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL; + else + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT; + + /* Last 4 bytes are salt */ + key_len -= 4; + sa_entry->aes_key_len = cn10k_ipsec_get_aes_key_len(key_len); + memcpy(sa_entry->cipher_key, key, key_len); + tmp_key = (u64 *)sa_entry->cipher_key; + + for (idx = 0; idx < key_len / 8; idx++) + tmp_key[idx] = (__force u64)cpu_to_be64(tmp_key[idx]); + + memcpy(&sa_entry->iv_gcm_salt, key + key_len, 4); + tmp_salt = (u32 *)&sa_entry->iv_gcm_salt; + *tmp_salt = (__force u32)cpu_to_be32(*tmp_salt); + + /* Write SA context data to memory before enabling */ + wmb(); + + /* Enable SA */ + sa_entry->sa_valid = 1; +} + +static int cn10k_ipsec_validate_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->props.aalgo != SADB_AALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload authenticated xfrm states"); + return -EINVAL; + } + if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) { + NL_SET_ERR_MSG_MOD(extack, + "Only AES-GCM-ICV16 xfrm state may be offloaded"); + return -EINVAL; + } + if (x->props.calgo != SADB_X_CALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload compressed xfrm states"); + return -EINVAL; + } + if (x->props.flags & XFRM_STATE_ESN) { + NL_SET_ERR_MSG_MOD(extack, "Cannot offload ESN xfrm states"); + return -EINVAL; + } + if (x->props.family != AF_INET && x->props.family != AF_INET6) { + NL_SET_ERR_MSG_MOD(extack, + "Only IPv4/v6 xfrm states may be offloaded"); + return -EINVAL; + } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload other than crypto-mode"); + 
return -EINVAL; + } + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG_MOD(extack, + "Only tunnel/transport xfrm states may be offloaded"); + return -EINVAL; + } + if (x->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG_MOD(extack, + "Only ESP xfrm state may be offloaded"); + return -EINVAL; + } + if (x->encap) { + NL_SET_ERR_MSG_MOD(extack, + "Encapsulated xfrm state may not be offloaded"); + return -EINVAL; + } + if (!x->aead) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without aead"); + return -EINVAL; + } + + if (x->aead->alg_icv_len != 128) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD ICV length other than 128bit"); + return -EINVAL; + } + if (x->aead->alg_key_len != 128 + 32 && + x->aead->alg_key_len != 192 + 32 && + x->aead->alg_key_len != 256 + 32) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD key length other than 128/192/256bit"); + return -EINVAL; + } + if (x->tfcpad) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with tfc padding"); + return -EINVAL; + } + if (!x->geniv) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without geniv"); + return -EINVAL; + } + if (strcmp(x->geniv, "seqiv")) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with geniv other than seqiv"); + return -EINVAL; + } + return 0; +} + +static int cn10k_ipsec_inb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG_MOD(extack, "xfrm inbound offload not supported"); + return -EOPNOTSUPP; +} + +static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + err = cn10k_ipsec_validate_state(x, extack); + if (err) + return err; + + pf = netdev_priv(netdev); + + err = qmem_alloc(pf->dev, &sa_info, pf->ipsec.sa_size, OTX2_ALIGN); + if (err) + return err; + + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + cn10k_outb_prepare_sa(x, sa_entry); + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error writing outbound SA"); + qmem_free(pf->dev, sa_info); + return err; + } + + x->xso.offload_handle = (unsigned long)sa_info; + /* Enable static branch when first SA setup */ + if (!pf->ipsec.outb_sa_count) + static_branch_enable(&cn10k_ipsec_sa_enabled); + pf->ipsec.outb_sa_count++; + return 0; +} + +static int cn10k_ipsec_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return cn10k_ipsec_inb_add_state(x, extack); + else + return cn10k_ipsec_outb_add_state(x, extack); +} + +static void cn10k_ipsec_del_state(struct xfrm_state *x) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return; + + pf = netdev_priv(netdev); + + sa_info = (struct qmem *)x->xso.offload_handle; + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + /* Disable SA in CPT h/w */ + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->aop_valid = 1; + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) + netdev_err(netdev, "Error (%d) deleting SA\n", err); + + x->xso.offload_handle = 0; + qmem_free(pf->dev, sa_info); + + /* If no more SA's 
then update netdev feature for potential change + * in NETIF_F_HW_ESP. + */ + if (!--pf->ipsec.outb_sa_count) + queue_work(pf->ipsec.sa_workq, &pf->ipsec.sa_work); +} + +static bool cn10k_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + if (x->props.family == AF_INET) { + /* Offload with IPv4 options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } + return true; +} + +static const struct xfrmdev_ops cn10k_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = cn10k_ipsec_add_state, + .xdo_dev_state_delete = cn10k_ipsec_del_state, + .xdo_dev_offload_ok = cn10k_ipsec_offload_ok, +}; + +static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) +{ + struct cn10k_ipsec *ipsec = container_of(work, struct cn10k_ipsec, + sa_work); + struct otx2_nic *pf = container_of(ipsec, struct otx2_nic, ipsec); + + /* Disable static branch when no more SA enabled */ + static_branch_disable(&cn10k_ipsec_sa_enabled); + rtnl_lock(); + netdev_update_features(pf->netdev); + rtnl_unlock(); +} + +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) +{ + struct otx2_nic *pf = netdev_priv(netdev); + + /* IPsec offload supported on cn10k */ + if (!is_dev_support_ipsec_offload(pf->pdev)) + return -EOPNOTSUPP; + + /* Initialize CPT for outbound ipsec offload */ + if (enable) + return cn10k_outb_cpt_init(netdev); + + /* Don't do CPT cleanup if SA installed */ + if (pf->ipsec.outb_sa_count) { + netdev_err(pf->netdev, "SA installed on this device\n"); + return -EBUSY; + } + + return cn10k_outb_cpt_clean(pf); +} + +int cn10k_ipsec_init(struct net_device *netdev) +{ + struct otx2_nic *pf = netdev_priv(netdev); + u32 sa_size; + + if (!is_dev_support_ipsec_offload(pf->pdev)) + return 0; + + /* Each SA entry size is 128 Byte round up in size */ + sa_size = sizeof(struct cn10k_tx_sa_s) % OTX2_ALIGN ? + (sizeof(struct cn10k_tx_sa_s) / OTX2_ALIGN + 1) * + OTX2_ALIGN : sizeof(struct cn10k_tx_sa_s); + pf->ipsec.sa_size = sa_size; + + INIT_WORK(&pf->ipsec.sa_work, cn10k_ipsec_sa_wq_handler); + pf->ipsec.sa_workq = alloc_workqueue("cn10k_ipsec_sa_workq", 0, 0); + if (!pf->ipsec.sa_workq) { + netdev_err(pf->netdev, "SA alloc workqueue failed\n"); + return -ENOMEM; + } + + /* Set xfrm device ops */ + netdev->xfrmdev_ops = &cn10k_ipsec_xfrmdev_ops; + netdev->hw_features |= NETIF_F_HW_ESP; + netdev->hw_enc_features |= NETIF_F_HW_ESP; + + cn10k_cpt_device_set_unavailable(pf); + return 0; +} +EXPORT_SYMBOL(cn10k_ipsec_init); + +void cn10k_ipsec_clean(struct otx2_nic *pf) +{ + if (!is_dev_support_ipsec_offload(pf->pdev)) + return; + + if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) + return; + + if (pf->ipsec.sa_workq) { + destroy_workqueue(pf->ipsec.sa_workq); + pf->ipsec.sa_workq = NULL; + } + + cn10k_outb_cpt_clean(pf); +} +EXPORT_SYMBOL(cn10k_ipsec_clean); + +static u16 cn10k_ipsec_get_ip_data_len(struct xfrm_state *x, + struct sk_buff *skb) +{ + struct ipv6hdr *ipv6h; + struct iphdr *iph; + u8 *src; + + src = (u8 *)skb->data + ETH_HLEN; + + if (x->props.family == AF_INET) { + iph = (struct iphdr *)src; + return ntohs(iph->tot_len); + } + + ipv6h = (struct ipv6hdr *)src; + return ntohs(ipv6h->payload_len) + sizeof(struct ipv6hdr); +} + +/* Prepare CPT and NIX SQE scatter/gather subdescriptor structure. + * SG of NIX and CPT are same in size. 
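+ * The CPT copy of each SG entry is built one SQE-size slot below the
+ * NIX SQE (sq->sqe_base - sq->sqe_size), so both descriptors carry the
+ * same DMA-mapped segment addresses and lengths.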
+ * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + struct cpt_sg_s *cpt_sg = NULL; + struct nix_sqe_sg_s *sg = NULL; + u64 dma_addr, *iova = NULL; + u64 *cpt_iova = NULL; + u16 *sg_lens = NULL; + int seg, len; + + sq->sg[sq->head].num_segs = 0; + cpt_sg = (struct cpt_sg_s *)(sq->sqe_base - sq->sqe_size); + + for (seg = 0; seg < num_segs; seg++) { + if ((seg % MAX_SEGS_PER_SG) == 0) { + sg = (struct nix_sqe_sg_s *)(sq->sqe_base + *offset); + sg->ld_type = NIX_SEND_LDTYPE_LDD; + sg->subdc = NIX_SUBDC_SG; + sg->segs = 0; + sg_lens = (void *)sg; + iova = (void *)sg + sizeof(*sg); + /* Next subdc always starts at a 16byte boundary. + * So if sg->segs is whether 2 or 3, offset += 16bytes. + */ + if ((num_segs - seg) >= (MAX_SEGS_PER_SG - 1)) + *offset += sizeof(*sg) + (3 * sizeof(u64)); + else + *offset += sizeof(*sg) + sizeof(u64); + + cpt_sg += (seg / MAX_SEGS_PER_SG) * 4; + cpt_iova = (void *)cpt_sg + sizeof(*cpt_sg); + } + dma_addr = otx2_dma_map_skb_frag(pfvf, skb, seg, &len); + if (dma_mapping_error(pfvf->dev, dma_addr)) + return false; + + sg_lens[seg % MAX_SEGS_PER_SG] = len; + sg->segs++; + *iova++ = dma_addr; + *cpt_iova++ = dma_addr; + + /* Save DMA mapping info for later unmapping */ + sq->sg[sq->head].dma_addr[seg] = dma_addr; + sq->sg[sq->head].size[seg] = len; + sq->sg[sq->head].num_segs++; + + *cpt_sg = *(struct cpt_sg_s *)sg; + cpt_sg->rsvd_63_50 = 0; + } + + sq->sg[sq->head].skb = (u64)skb; + return true; +} + +static u16 cn10k_ipsec_get_param1(u8 iv_offset) +{ + u16 param1_val; + + /* Set Crypto mode, disable L3/L4 checksum */ + param1_val = CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM | + CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM; + param1_val |= (u16)iv_offset << CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT; + return param1_val; +} + +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + struct cpt_inst_s inst; + struct cpt_res_s *res; + struct xfrm_state *x; + struct qmem *sa_info; + dma_addr_t dptr_iova; + struct sec_path *sp; + u8 encap_offset; + u8 auth_offset; + u8 gthr_size; + u8 iv_offset; + u16 dlen; + + /* Check for IPSEC offload enabled */ + if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) + goto drop; + + sp = skb_sec_path(skb); + if (unlikely(!sp->len)) + goto drop; + + x = xfrm_input_state(skb); + if (unlikely(!x)) + goto drop; + + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) + goto drop; + + dlen = cn10k_ipsec_get_ip_data_len(x, skb); + if (dlen == 0 && netif_msg_tx_err(pf)) { + netdev_err(pf->netdev, "Invalid IP header, ip-length zero\n"); + goto drop; + } + + /* Check for valid SA context */ + sa_info = (struct qmem *)x->xso.offload_handle; + if (!sa_info) + goto drop; + + memset(&inst, 0, sizeof(struct cpt_inst_s)); + + /* Get authentication offset */ + if (x->props.family == AF_INET) + auth_offset = sizeof(struct iphdr); + else + auth_offset = sizeof(struct ipv6hdr); + + /* IV offset is after ESP header */ + iv_offset = auth_offset + sizeof(struct ip_esp_hdr); + /* Encap will start after IV */ + encap_offset = iv_offset + GCM_RFC4106_IV_SIZE; + + /* CPT Instruction word-1 */ + res = (struct cpt_res_s 
*)(sq->cpt_resp->base + (64 * sq->head)); + res->compcode = 0; + inst.res_addr = sq->cpt_resp->iova + (64 * sq->head); + + /* CPT Instruction word-2 */ + inst.rvu_pf_func = pf->pcifunc; + + /* CPT Instruction word-3: + * Set QORD to force CPT_RES_S write completion + */ + inst.qord = 1; + + /* CPT Instruction word-4 */ + /* inst.dlen should not include ICV length */ + inst.dlen = dlen + ETH_HLEN - (x->aead->alg_icv_len / 8); + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC; + inst.param1 = cn10k_ipsec_get_param1(iv_offset); + + inst.param2 = encap_offset << + CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT; + inst.param2 |= (u16)auth_offset << + CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT; + + /* CPT Instruction word-5 */ + gthr_size = num_segs / MAX_SEGS_PER_SG; + gthr_size = (num_segs % MAX_SEGS_PER_SG) ? gthr_size + 1 : gthr_size; + + gthr_size &= 0xF; + dptr_iova = (sq->sqe_ring->iova + (sq->head * (sq->sqe_size * 2))); + inst.dptr = dptr_iova | ((u64)gthr_size << 60); + + /* CPT Instruction word-6 */ + inst.rptr = inst.dptr; + + /* CPT Instruction word-7 */ + inst.cptr = sa_info->iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* CPT Instruction word-0 */ + inst.nixtxl = (size / 16) - 1; + inst.dat_offset = ETH_HLEN; + inst.nixtx_offset = sq->sqe_size; + + netdev_tx_sent_queue(txq, skb->len); + + /* Finally Flush the CPT instruction */ + sq->head++; + sq->head &= (sq->sqe_cnt - 1); + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + return true; +drop: + dev_kfree_skb_any(skb); + return false; +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h new file mode 100644 index 000000000000..9965df0faa3e --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell IPSEC offload driver + * + * Copyright (C) 2024 Marvell. + */ + +#ifndef CN10K_IPSEC_H +#define CN10K_IPSEC_H + +#include <linux/types.h> + +DECLARE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + +/* CPT instruction size in bytes */ +#define CN10K_CPT_INST_SIZE 64 + +/* CPT instruction (CPT_INST_S) queue length */ +#define CN10K_CPT_INST_QLEN 8200 + +/* CPT instruction queue size passed to HW is in units of + * 40*CPT_INST_S messages. 
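+ *
+ * With CN10K_CPT_INST_QLEN = 8200 this works out to 8200 / 40 = 205
+ * units, plus 320 / 40 = 8 extra units for the 320 free entries the
+ * CPT requires (see CN10K_CPT_INST_QLEN_EXTRA_BYTES below).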
+ */ +#define CN10K_CPT_SIZE_DIV40 (CN10K_CPT_INST_QLEN / 40) + +/* CPT needs 320 free entries */ +#define CN10K_CPT_INST_QLEN_EXTRA_BYTES (320 * CN10K_CPT_INST_SIZE) +#define CN10K_CPT_EXTRA_SIZE_DIV40 (320 / 40) + +/* CPT instruction queue length in bytes */ +#define CN10K_CPT_INST_QLEN_BYTES \ + ((CN10K_CPT_SIZE_DIV40 * 40 * CN10K_CPT_INST_SIZE) + \ + CN10K_CPT_INST_QLEN_EXTRA_BYTES) + +/* CPT instruction group queue length in bytes */ +#define CN10K_CPT_INST_GRP_QLEN_BYTES \ + ((CN10K_CPT_SIZE_DIV40 + CN10K_CPT_EXTRA_SIZE_DIV40) * 16) + +/* CPT FC length in bytes */ +#define CN10K_CPT_Q_FC_LEN 128 + +/* Default CPT engine group for ipsec offload */ +#define CN10K_DEF_CPT_IPSEC_EGRP 1 + +/* CN10K CPT LF registers */ +#define CPT_LFBASE (BLKTYPE_CPT << RVU_FUNC_BLKADDR_SHIFT) +#define CN10K_CPT_LF_CTL (CPT_LFBASE | 0x10) +#define CN10K_CPT_LF_INPROG (CPT_LFBASE | 0x40) +#define CN10K_CPT_LF_Q_BASE (CPT_LFBASE | 0xf0) +#define CN10K_CPT_LF_Q_SIZE (CPT_LFBASE | 0x100) +#define CN10K_CPT_LF_Q_INST_PTR (CPT_LFBASE | 0x110) +#define CN10K_CPT_LF_Q_GRP_PTR (CPT_LFBASE | 0x120) +#define CN10K_CPT_LF_NQX(a) (CPT_LFBASE | 0x400 | (a) << 3) +#define CN10K_CPT_LF_CTX_FLUSH (CPT_LFBASE | 0x510) + +/* IPSEC Instruction opcodes */ +#define CN10K_IPSEC_MAJOR_OP_WRITE_SA 0x01UL +#define CN10K_IPSEC_MINOR_OP_WRITE_SA 0x09UL +#define CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC 0x2AUL + +enum cn10k_cpt_comp_e { + CN10K_CPT_COMP_E_NOTDONE = 0x00, + CN10K_CPT_COMP_E_GOOD = 0x01, + CN10K_CPT_COMP_E_FAULT = 0x02, + CN10K_CPT_COMP_E_HWERR = 0x04, + CN10K_CPT_COMP_E_INSTERR = 0x05, + CN10K_CPT_COMP_E_WARN = 0x06, + CN10K_CPT_COMP_E_MASK = 0x3F +}; + +struct cn10k_cpt_inst_queue { + u8 *vaddr; + u8 *real_vaddr; + dma_addr_t dma_addr; + dma_addr_t real_dma_addr; + u32 size; +}; + +enum cn10k_cpt_hw_state_e { + CN10K_CPT_HW_UNAVAILABLE, + CN10K_CPT_HW_AVAILABLE, + CN10K_CPT_HW_IN_USE +}; + +struct cn10k_ipsec { + /* Outbound CPT */ + u64 io_addr; + atomic_t cpt_state; + struct cn10k_cpt_inst_queue iq; + + /* SA info */ + u32 sa_size; + u32 outb_sa_count; + struct work_struct sa_work; + struct workqueue_struct *sa_workq; +}; + +/* CN10K IPSEC Security Association (SA) */ +/* SA direction */ +#define CN10K_IPSEC_SA_DIR_INB 0 +#define CN10K_IPSEC_SA_DIR_OUTB 1 +/* SA protocol */ +#define CN10K_IPSEC_SA_IPSEC_PROTO_AH 0 +#define CN10K_IPSEC_SA_IPSEC_PROTO_ESP 1 +/* SA Encryption Type */ +#define CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM 5 +/* SA IPSEC mode Transport/Tunnel */ +#define CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT 0 +#define CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL 1 +/* SA AES Key Length */ +#define CN10K_IPSEC_SA_AES_KEY_LEN_128 1 +#define CN10K_IPSEC_SA_AES_KEY_LEN_192 2 +#define CN10K_IPSEC_SA_AES_KEY_LEN_256 3 +/* IV Source */ +#define CN10K_IPSEC_SA_IV_SRC_COUNTER 0 +#define CN10K_IPSEC_SA_IV_SRC_PACKET 3 + +struct cn10k_tx_sa_s { + u64 esn_en : 1; /* W0 */ + u64 rsvd_w0_1_8 : 8; + u64 hw_ctx_off : 7; + u64 ctx_id : 16; + u64 rsvd_w0_32_47 : 16; + u64 ctx_push_size : 7; + u64 rsvd_w0_55 : 1; + u64 ctx_hdr_size : 2; + u64 aop_valid : 1; + u64 rsvd_w0_59 : 1; + u64 ctx_size : 4; + u64 w1; /* W1 */ + u64 sa_valid : 1; /* W2 */ + u64 sa_dir : 1; + u64 rsvd_w2_2_3 : 2; + u64 ipsec_mode : 1; + u64 ipsec_protocol : 1; + u64 aes_key_len : 2; + u64 enc_type : 3; + u64 rsvd_w2_11_19 : 9; + u64 iv_src : 2; + u64 rsvd_w2_22_31 : 10; + u64 rsvd_w2_32_63 : 32; + u64 w3; /* W3 */ + u8 cipher_key[32]; /* W4 - W7 */ + u32 rsvd_w8_0_31; /* W8 : IV */ + u32 iv_gcm_salt; + u64 rsvd_w9_w30[22]; /* W9 - W30 */ + u64 hw_ctx[6]; /* W31 - W36 */ +}; + +/* CPT 
instruction parameter-1 */ +#define CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM 0x1 +#define CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM 0x2 +#define CN10K_IPSEC_INST_PARAM1_CRYPTO_MODE 0x20 +#define CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT 8 + +/* CPT instruction parameter-2 */ +#define CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT 0 +#define CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT 8 + +/* CPT Instruction Structure */ +struct cpt_inst_s { + u64 nixtxl : 3; /* W0 */ + u64 doneint : 1; + u64 rsvd_w0_4_15 : 12; + u64 dat_offset : 8; + u64 ext_param1 : 8; + u64 nixtx_offset : 20; + u64 rsvd_w0_52_63 : 12; + u64 res_addr; /* W1 */ + u64 tag : 32; /* W2 */ + u64 tt : 2; + u64 grp : 10; + u64 rsvd_w2_44_47 : 4; + u64 rvu_pf_func : 16; + u64 qord : 1; /* W3 */ + u64 rsvd_w3_1_2 : 2; + u64 wqe_ptr : 61; + u64 dlen : 16; /* W4 */ + u64 param2 : 16; + u64 param1 : 16; + u64 opcode_major : 8; + u64 opcode_minor : 8; + u64 dptr; /* W5 */ + u64 rptr; /* W6 */ + u64 cptr : 60; /* W7 */ + u64 ctx_val : 1; + u64 egrp : 3; +}; + +/* CPT Instruction Result Structure */ +struct cpt_res_s { + u64 compcode : 7; /* W0 */ + u64 doneint : 1; + u64 uc_compcode : 8; + u64 uc_info : 48; + u64 esn; /* W1 */ +}; + +/* CPT SG structure */ +struct cpt_sg_s { + u64 seg1_size : 16; + u64 seg2_size : 16; + u64 seg3_size : 16; + u64 segs : 2; + u64 rsvd_63_50 : 14; +}; + +/* CPT LF_INPROG Register */ +#define CPT_LF_INPROG_INFLIGHT GENMASK_ULL(8, 0) +#define CPT_LF_INPROG_GRB_CNT GENMASK_ULL(39, 32) +#define CPT_LF_INPROG_GWB_CNT GENMASK_ULL(47, 40) + +/* CPT LF_Q_GRP_PTR Register */ +#define CPT_LF_Q_GRP_PTR_DQ_PTR GENMASK_ULL(14, 0) +#define CPT_LF_Q_GRP_PTR_NQ_PTR GENMASK_ULL(46, 32) + +/* CPT LF_Q_BASE Register */ +#define CPT_LF_Q_BASE_ADDR GENMASK_ULL(52, 7) + +/* CPT LF_Q_SIZE Register */ +#define CPT_LF_Q_SIZE_DIV40 GENMASK_ULL(14, 0) + +/* CPT LF CTX Flush Register */ +#define CPT_LF_CTX_FLUSH GENMASK_ULL(45, 0) + +#ifdef CONFIG_XFRM_OFFLOAD +int cn10k_ipsec_init(struct net_device *netdev); +void cn10k_ipsec_clean(struct otx2_nic *pf); +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable); +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset); +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size); +#else +static inline __maybe_unused int cn10k_ipsec_init(struct net_device *netdev) +{ + return 0; +} + +static inline __maybe_unused void cn10k_ipsec_clean(struct otx2_nic *pf) +{ +} + +static inline __maybe_unused +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) +{ + return 0; +} + +static inline bool __maybe_unused +otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + return true; +} + +static inline bool __maybe_unused +cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + return true; +} +#endif +#endif // CN10K_IPSEC_H diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 523ecb798a7a..2b49bfec7869 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -10,12 +10,19 @@ #include <net/page_pool/helpers.h> #include <net/tso.h> #include <linux/bitfield.h> +#include <linux/dcbnl.h> +#include 
<net/xfrm.h> #include "otx2_reg.h" #include "otx2_common.h" #include "otx2_struct.h" #include "cn10k.h" +static bool otx2_is_pfc_enabled(struct otx2_nic *pfvf) +{ + return IS_ENABLED(CONFIG_DCB) && !!pfvf->pfc_en; +} + static void otx2_nix_rq_op_stats(struct queue_stats *stats, struct otx2_nic *pfvf, int qidx) { @@ -964,6 +971,29 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) if (err) return err; + /* Allocate memory for NIX SQE (which includes NIX SG) and CPT SG. + * The NIX and CPT SGs are the same size. Allocate memory for the CPT + * SG equal to the NIX SQE size for base address alignment. + * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ + err = qmem_alloc(pfvf->dev, &sq->sqe_ring, qset->sqe_cnt, + sq->sqe_size * 2); + if (err) + return err; + + err = qmem_alloc(pfvf->dev, &sq->cpt_resp, qset->sqe_cnt, 64); + if (err) + return err; + if (qidx < pfvf->hw.tx_queues) { err = qmem_alloc(pfvf->dev, &sq->tso_hdrs, qset->sqe_cnt, TSO_HEADER_SIZE); @@ -1722,18 +1752,43 @@ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable) return -ENOMEM; req->chan_base = 0; -#ifdef CONFIG_DCB - req->chan_cnt = pfvf->pfc_en ? IEEE_8021QAZ_MAX_TCS : 1; - req->bpid_per_chan = pfvf->pfc_en ? 1 : 0; -#else - req->chan_cnt = 1; - req->bpid_per_chan = 0; -#endif + if (otx2_is_pfc_enabled(pfvf)) { + req->chan_cnt = IEEE_8021QAZ_MAX_TCS; + req->bpid_per_chan = 1; + } else { + req->chan_cnt = 1; + req->bpid_per_chan = 0; + } return otx2_sync_mbox_msg(&pfvf->mbox); } EXPORT_SYMBOL(otx2_nix_config_bp); +int otx2_nix_cpt_config_bp(struct otx2_nic *pfvf, bool enable) +{ + struct nix_bp_cfg_req *req; + + if (enable) + req = otx2_mbox_alloc_msg_nix_cpt_bp_enable(&pfvf->mbox); + else + req = otx2_mbox_alloc_msg_nix_cpt_bp_disable(&pfvf->mbox); + + if (!req) + return -ENOMEM; + + req->chan_base = 0; + if (otx2_is_pfc_enabled(pfvf)) { + req->chan_cnt = IEEE_8021QAZ_MAX_TCS; + req->bpid_per_chan = 1; + } else { + req->chan_cnt = 1; + req->bpid_per_chan = 0; + } + + return otx2_sync_mbox_msg(&pfvf->mbox); +} +EXPORT_SYMBOL(otx2_nix_cpt_config_bp); + /* Mbox message handlers */ void mbox_handler_cgx_stats(struct otx2_nic *pfvf, struct cgx_stats_rsp *rsp) @@ -1947,3 +2002,48 @@ EXPORT_SYMBOL(otx2_mbox_up_handler_ ## _fn_name); MBOX_UP_CGX_MESSAGES MBOX_UP_MCS_MESSAGES #undef M + +dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, + struct sk_buff *skb, int seg, int *len) +{ + enum dma_data_direction dir = DMA_TO_DEVICE; + const skb_frag_t *frag; + struct page *page; + int offset; + + /* Crypto hardware needs write permission for IPsec crypto offload */ + if (unlikely(xfrm_offload(skb))) { + dir = DMA_BIDIRECTIONAL; + skb = skb_unshare(skb, GFP_ATOMIC); + } + + /* First segment is always skb->data */ + if (!seg) { + page = virt_to_page(skb->data); + offset = offset_in_page(skb->data); + *len = skb_headlen(skb); + } else { + frag = &skb_shinfo(skb)->frags[seg - 1]; + page = skb_frag_page(frag); + offset = skb_frag_off(frag); + *len = skb_frag_size(frag); + } + return otx2_dma_map_page(pfvf, page, offset, *len, dir); +} + +void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg) +{ + enum dma_data_direction dir = DMA_TO_DEVICE; + struct sk_buff *skb = NULL; + int seg; + + skb = (struct sk_buff *)sg->skb; + if (unlikely(xfrm_offload(skb))) + dir = DMA_BIDIRECTIONAL; + + for (seg = 0; seg < sg->num_segs; seg++) {
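+		/* Unmap each segment with the same DMA direction chosen at map time */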
+ otx2_dma_unmap_page(pfvf, sg->dma_addr[seg], + sg->size[seg], dir); + } + sg->num_segs = 0; +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 566848663fea..65814e3dc93f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -30,6 +30,7 @@ #include <rvu_trace.h> #include "qos.h" #include "rep.h" +#include "cn10k_ipsec.h" /* IPv4 flag more fragment bit */ #define IPV4_FLAG_MORE 0x20 @@ -40,6 +41,7 @@ #define PCI_DEVID_OCTEONTX2_RVU_AFVF 0xA0F8 #define PCI_SUBSYS_DEVID_96XX_RVU_PFVF 0xB200 +#define PCI_SUBSYS_DEVID_CN10K_A_RVU_PFVF 0xB900 #define PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF 0xBD00 #define PCI_DEVID_OCTEONTX2_SDP_REP 0xA0F7 @@ -55,6 +57,9 @@ #define NIX_PF_PFC_PRIO_MAX 8 #endif +/* Number of segments per SG structure */ +#define MAX_SEGS_PER_SG 3 + enum arua_mapped_qtypes { AURA_NIX_RQ, AURA_NIX_SQ, @@ -448,6 +453,7 @@ struct otx2_nic { #define OTX2_FLAG_TC_MARK_ENABLED BIT_ULL(17) #define OTX2_FLAG_REP_MODE_ENABLED BIT_ULL(18) #define OTX2_FLAG_PORT_UP BIT_ULL(19) +#define OTX2_FLAG_IPSEC_OFFLOAD_ENABLED BIT_ULL(20) u64 flags; u64 *cq_op_addr; @@ -499,9 +505,9 @@ struct otx2_nic { /* Devlink */ struct otx2_devlink *dl; -#ifdef CONFIG_DCB /* PFC */ u8 pfc_en; +#ifdef CONFIG_DCB u8 *queue_to_pfc_map; u16 pfc_schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC]; bool pfc_alloc_status[NIX_PF_PFC_PRIO_MAX]; @@ -522,6 +528,9 @@ struct otx2_nic { u16 rep_pf_map[RVU_MAX_REP]; u16 esw_mode; #endif + + /* Inline ipsec */ + struct cn10k_ipsec ipsec; }; static inline bool is_otx2_lbkvf(struct pci_dev *pdev) @@ -572,6 +581,15 @@ static inline bool is_dev_cn10kb(struct pci_dev *pdev) return pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF; } +static inline bool is_dev_cn10ka_b0(struct pci_dev *pdev) +{ + if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_A_RVU_PFVF && + (pdev->revision & 0xFF) == 0x54) + return true; + + return false; +} + static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf) { struct otx2_hw *hw = &pfvf->hw; @@ -621,6 +639,9 @@ static inline void __iomem *otx2_get_regaddr(struct otx2_nic *nic, u64 offset) case BLKTYPE_NPA: blkaddr = BLKADDR_NPA; break; + case BLKTYPE_CPT: + blkaddr = BLKADDR_CPT0; + break; default: blkaddr = BLKADDR_RVUM; break; @@ -985,6 +1006,7 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable); void otx2_ctx_disable(struct mbox *mbox, int type, bool npa); int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable); +int otx2_nix_cpt_config_bp(struct otx2_nic *pfvf, bool enable); void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, int qidx); void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura); @@ -1149,4 +1171,8 @@ static inline int mcam_entry_cmp(const void *a, const void *b) { return *(u16 *)a - *(u16 *)b; } + +dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, + struct sk_buff *skb, int seg, int *len); +void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg); #endif /* OTX2_COMMON_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c index 294fba58b670..f110dfa42360 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c +++ 
b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c @@ -435,6 +435,9 @@ process_pfc: return err; } + /* Disable backpressure on NIX-CPT by default */ + otx2_nix_cpt_config_bp(pfvf, false); + /* Request Per channel Bpids */ if (pfc->pfc_en) otx2_nix_config_bp(pfvf, true); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index e310f99b1736..e1dde93e8af8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -26,6 +26,7 @@ #include "cn10k.h" #include "qos.h" #include <rvu_trace.h> +#include "cn10k_ipsec.h" #define DRV_NAME "rvu_nicpf" #define DRV_STRING "Marvell RVU NIC Physical Function Driver" @@ -1484,6 +1485,8 @@ static void otx2_free_sq_res(struct otx2_nic *pf) if (!sq->sqe) continue; qmem_free(pf->dev, sq->sqe); + qmem_free(pf->dev, sq->sqe_ring); + qmem_free(pf->dev, sq->cpt_resp); qmem_free(pf->dev, sq->tso_hdrs); kfree(sq->sg); kfree(sq->sqb_ptrs); @@ -1551,6 +1554,9 @@ int otx2_init_hw_resources(struct otx2_nic *pf) if (err) goto err_free_npa_lf; + /* Disable backpressure on NIX-CPT by default */ + otx2_nix_cpt_config_bp(pf, false); + /* Enable backpressure for CGX mapped PF/VFs */ if (!is_otx2_lbkvf(pf->pdev)) otx2_nix_config_bp(pf, true); @@ -2273,6 +2279,10 @@ static int otx2_set_features(struct net_device *netdev, return otx2_enable_rxvlan(pf, features & NETIF_F_HW_VLAN_CTAG_RX); + if (changed & NETIF_F_HW_ESP) + return cn10k_ipsec_ethtool_init(netdev, + features & NETIF_F_HW_ESP); + return otx2_handle_ntuple_tc_features(netdev, features); } @@ -3162,10 +3172,14 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* reset CGX/RPM MAC stats */ otx2_reset_mac_stats(pf); + err = cn10k_ipsec_init(netdev); + if (err) + goto err_mcs_free; + err = register_netdev(netdev); if (err) { dev_err(dev, "Failed to register netdevice\n"); - goto err_mcs_free; + goto err_ipsec_clean; } err = otx2_wq_init(pf); @@ -3206,6 +3220,8 @@ err_mcam_flow_del: otx2_mcam_flow_del(pf); err_unreg_netdev: unregister_netdev(netdev); +err_ipsec_clean: + cn10k_ipsec_clean(pf); err_mcs_free: cn10k_mcs_free(pf); err_del_mcam_entries: @@ -3403,6 +3419,7 @@ static void otx2_remove(struct pci_dev *pdev) otx2_unregister_dl(pf); unregister_netdev(netdev); + cn10k_ipsec_clean(pf); cn10k_mcs_free(pf); otx2_sriov_disable(pf->pdev); otx2_sriov_vfcfg_cleanup(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 04bc06a80e23..224cef938927 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -11,6 +11,7 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/ip6_checksum.h> +#include <net/xfrm.h> #include "otx2_reg.h" #include "otx2_common.h" @@ -26,12 +27,25 @@ */ #define PTP_SYNC_SEC_OFFSET 34 +DEFINE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct bpf_prog *prog, struct nix_cqe_rx_s *cqe, struct otx2_cq_queue *cq, bool *need_xdp_flush); +static void otx2_sq_set_sqe_base(struct otx2_snd_queue *sq, + struct sk_buff *skb) +{ + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + sq->sqe_base = sq->sqe_ring->base + sq->sqe_size + + (sq->head * (sq->sqe_size * 2)); + else + sq->sqe_base = sq->sqe->base; +} + static int otx2_nix_cq_op_status(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) { @@ -80,38 
+94,6 @@ static unsigned int frag_num(unsigned int i) #endif } -static dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, - struct sk_buff *skb, int seg, int *len) -{ - const skb_frag_t *frag; - struct page *page; - int offset; - - /* First segment is always skb->data */ - if (!seg) { - page = virt_to_page(skb->data); - offset = offset_in_page(skb->data); - *len = skb_headlen(skb); - } else { - frag = &skb_shinfo(skb)->frags[seg - 1]; - page = skb_frag_page(frag); - offset = skb_frag_off(frag); - *len = skb_frag_size(frag); - } - return otx2_dma_map_page(pfvf, page, offset, *len, DMA_TO_DEVICE); -} - -static void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg) -{ - int seg; - - for (seg = 0; seg < sg->num_segs; seg++) { - otx2_dma_unmap_page(pfvf, sg->dma_addr[seg], - sg->size[seg], DMA_TO_DEVICE); - } - sg->num_segs = 0; -} - static void otx2_xdp_snd_pkt_handler(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, struct nix_cqe_tx_s *cqe) @@ -625,7 +607,6 @@ void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, sq->head &= (sq->sqe_cnt - 1); } -#define MAX_SEGS_PER_SG 3 /* Add SQE scatter/gather subdescriptor structure */ static bool otx2_sqe_add_sg(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, struct sk_buff *skb, int num_segs, int *offset) @@ -1161,6 +1142,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, int offset, num_segs, free_desc; struct nix_sqe_hdr_s *sqe_hdr; struct otx2_nic *pfvf = dev; + bool ret; /* Check if there is enough room between producer * and consumer index. @@ -1177,6 +1159,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, /* If SKB doesn't fit in a single SQE, linearize it. * TODO: Consider adding JUMP descriptor instead. */ + if (unlikely(num_segs > OTX2_MAX_FRAGS_IN_SQE)) { if (__skb_linearize(skb)) { dev_kfree_skb_any(skb); @@ -1196,6 +1179,9 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, return true; } + /* Set sqe base address */ + otx2_sq_set_sqe_base(sq, skb); + /* Set SQE's SEND_HDR. * Do not clear the first 64bit as it contains constant info. 
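	 * The rest of the SQE is zeroed and rebuilt for each packet.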
*/ @@ -1208,7 +1194,13 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, otx2_sqe_add_ext(pfvf, sq, skb, &offset); /* Add SG subdesc with data frags */ - if (!otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset)) { + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + ret = otx2_sqe_add_sg_ipsec(pfvf, sq, skb, num_segs, &offset); + else + ret = otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset); + + if (!ret) { otx2_dma_unmap_skb_frags(pfvf, &sq->sg[sq->head]); return false; } @@ -1217,11 +1209,15 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, sqe_hdr->sizem1 = (offset / 16) - 1; + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + return cn10k_ipsec_transmit(pfvf, txq, sq, skb, num_segs, + offset); + netdev_tx_sent_queue(txq, skb->len); /* Flush SQE to HW */ pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); - return true; } EXPORT_SYMBOL(otx2_sq_append_skb); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index e1db5f961877..d23810963fdb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -101,6 +101,9 @@ struct otx2_snd_queue { struct queue_stats stats; u16 sqb_count; u64 *sqb_ptrs; + /* SQE ring and CPT response queue for Inline IPSEC */ + struct qmem *sqe_ring; + struct qmem *cpt_resp; } ____cacheline_aligned_in_smp; enum cq_type { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 839fc77c11b2..e926c6ce96cf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -14,6 +14,7 @@ #include "otx2_reg.h" #include "otx2_ptp.h" #include "cn10k.h" +#include "cn10k_ipsec.h" #define DRV_NAME "rvu_nicvf" #define DRV_STRING "Marvell RVU NIC Virtual Function Driver" @@ -693,10 +694,14 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) pdev->bus->number, n); } + err = cn10k_ipsec_init(netdev); + if (err) + goto err_ptp_destroy; + err = register_netdev(netdev); if (err) { dev_err(dev, "Failed to register netdevice\n"); - goto err_ptp_destroy; + goto err_ipsec_clean; } err = otx2_vf_wq_init(vf); @@ -730,6 +735,8 @@ err_shutdown_tc: otx2_shutdown_tc(vf); err_unreg_netdev: unregister_netdev(netdev); +err_ipsec_clean: + cn10k_ipsec_clean(vf); err_ptp_destroy: otx2_ptp_destroy(vf); err_detach_rsrc: @@ -782,6 +789,7 @@ static void otx2vf_remove(struct pci_dev *pdev) unregister_netdev(netdev); if (vf->otx2_wq) destroy_workqueue(vf->otx2_wq); + cn10k_ipsec_clean(vf); otx2_ptp_destroy(vf); otx2_mcam_flow_del(vf); otx2_shutdown_tc(vf); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c3..15e765a41d72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3946,6 +3946,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) } stats->rx_missed_errors = priv->stats.qcnt.rx_out_of_buffer; + stats->rx_dropped = PPORT_2863_GET(pstats, if_in_discards); stats->rx_length_errors = PPORT_802_3_GET(pstats, a_in_range_length_errors) + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index e393391966e0..39a209b9b684 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -56,6 +56,8 @@ bool mlx5_qos_tsar_type_supported(struct mlx5_core_dev *dev, int type, u8 hierar return cap & TSAR_TYPE_CAP_MASK_ROUND_ROBIN; case TSAR_ELEMENT_TSAR_TYPE_ETS: return cap & TSAR_TYPE_CAP_MASK_ETS; + case TSAR_ELEMENT_TSAR_TYPE_TC_ARB: + return cap & TSAR_TYPE_CAP_MASK_TC_ARB; } return false; @@ -87,6 +89,8 @@ bool mlx5_qos_element_type_supported(struct mlx5_core_dev *dev, int type, u8 hie return cap & ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC; case SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP: return cap & ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT: + return cap & ELEMENT_TYPE_CAP_MASK_RATE_LIMIT; } return false; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 6d11225594dd..24c3ff6fcf71 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -73,6 +73,7 @@ struct mlxsw_tx_info { }; struct mlxsw_rx_md_info { + struct napi_struct *napi; u32 cookie_index; u32 latency; u32 tx_congestion; diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index d6f37456fb31..0863dca2fc0b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -737,6 +737,7 @@ static void mlxsw_pci_cqe_rdq_md_init(struct sk_buff *skb, const char *cqe) } static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, + struct napi_struct *napi, struct mlxsw_pci_queue *q, u16 consumer_counter_limit, enum mlxsw_pci_cqe_v cqe_v, char *cqe) @@ -807,6 +808,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, } mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe); + mlxsw_skb_cb(skb)->rx_md_info.napi = napi; mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); @@ -869,7 +871,7 @@ static int mlxsw_pci_napi_poll_cq_rx(struct napi_struct *napi, int budget) continue; } - mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq, + mlxsw_pci_cqe_rdq_handle(mlxsw_pci, napi, rdq, wqe_counter, q->u.cq.v, cqe); if (++work_done == budget) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 3f5e5d99251b..aa71993daf28 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2449,7 +2449,7 @@ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u64_stats_update_end(&pcpu_stats->syncp); skb->protocol = eth_type_trans(skb, skb->dev); - netif_receive_skb(skb); + napi_gro_receive(mlxsw_skb_cb(skb)->rx_md_info.napi, skb); } static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u16 local_port, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index f07955b5439f..6a4a81c63451 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -192,6 +192,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } + if (sample_act_count) { + NL_SET_ERR_MSG_MOD(extack, "Mirror action after sample action is not supported"); + return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei, block, out_dev, extack); @@ -265,6 +270,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } + if (mirror_act_count) { + NL_SET_ERR_MSG_MOD(extack, "Sample action after mirror action is not supported"); + 
return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_sample(mlxsw_sp, rulei, block, act->sample.psample_group, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 899c954e0e5f..1f9c1c86839f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -173,7 +173,7 @@ static void mlxsw_sp_rx_no_mark_listener(struct sk_buff *skb, u16 local_port, if (err) return; - netif_receive_skb(skb); + napi_gro_receive(mlxsw_skb_cb(skb)->rx_md_info.napi, skb); } static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u16 local_port, diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 558e03301aa8..8d48468cddd7 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -758,12 +758,13 @@ static int ocelot_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct ocelot_dump_ctx *dump = data; + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 98e098c09c03..abba165738a3 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2779,7 +2779,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) break; } - netdev->watchdog_timeo = msecs_to_jiffies(5 * 1000); + netdev->watchdog_timeo = secs_to_jiffies(5); /* MTU range: 68 - hw-specific max */ netdev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 1c61390677f7..04f00ea94230 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -18,8 +18,6 @@ struct ionic_lif; #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF 0x1002 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003 -#define IONIC_ASIC_TYPE_ELBA 2 - #define DEVCMD_TIMEOUT 5 #define IONIC_ADMINQ_TIME_SLICE msecs_to_jiffies(100) @@ -59,7 +57,6 @@ struct ionic { DECLARE_BITMAP(intrs, IONIC_INTR_CTRL_REGS_MAX); cpumask_var_t *affinity_masks; struct delayed_work doorbell_check_dwork; - struct work_struct nb_work; struct notifier_block nb; struct rw_semaphore vf_op_lock; /* lock for VF operations */ struct ionic_vf *vfs; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index 9b7f78b6cdb1..a2d4336d2766 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -158,6 +158,20 @@ static int ionic_get_link_ksettings(struct net_device *netdev, 25000baseCR_Full); copper_seen++; break; + case IONIC_XCVR_PID_QSFP_50G_CR2_FC: + case IONIC_XCVR_PID_QSFP_50G_CR2: + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseCR2_Full); + copper_seen++; + break; + case IONIC_XCVR_PID_QSFP_200G_CR4: + ethtool_link_ksettings_add_link_mode(ks, supported, 200000baseCR4_Full); + copper_seen++; + break; + case IONIC_XCVR_PID_QSFP_400G_CR4: + ethtool_link_ksettings_add_link_mode(ks, supported, 400000baseCR4_Full); + copper_seen++; + break; case 
IONIC_XCVR_PID_SFP_10GBASE_AOC: case IONIC_XCVR_PID_SFP_10GBASE_CU: ethtool_link_ksettings_add_link_mode(ks, supported, @@ -196,6 +210,31 @@ static int ionic_get_link_ksettings(struct net_device *netdev, ethtool_link_ksettings_add_link_mode(ks, supported, 25000baseSR_Full); break; + case IONIC_XCVR_PID_QSFP_200G_AOC: + case IONIC_XCVR_PID_QSFP_200G_SR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseSR4_Full); + break; + case IONIC_XCVR_PID_QSFP_200G_FR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseLR4_ER4_FR4_Full); + break; + case IONIC_XCVR_PID_QSFP_200G_DR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseDR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_FR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseLR4_ER4_FR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_DR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseDR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_SR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseSR4_Full); + break; case IONIC_XCVR_PID_SFP_10GBASE_SR: ethtool_link_ksettings_add_link_mode(ks, supported, 10000baseSR_Full); @@ -929,6 +968,7 @@ static int ionic_get_module_info(struct net_device *netdev, break; case SFF8024_ID_QSFP_8436_8636: case SFF8024_ID_QSFP28_8636: + case SFF8024_ID_QSFP_PLUS_CMIS: modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; break; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 9c85c0706c6e..830c8adbfbee 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -1277,7 +1277,10 @@ enum ionic_xcvr_pid { IONIC_XCVR_PID_SFP_25GBASE_CR_S = 3, IONIC_XCVR_PID_SFP_25GBASE_CR_L = 4, IONIC_XCVR_PID_SFP_25GBASE_CR_N = 5, - + IONIC_XCVR_PID_QSFP_50G_CR2_FC = 6, + IONIC_XCVR_PID_QSFP_50G_CR2 = 7, + IONIC_XCVR_PID_QSFP_200G_CR4 = 8, + IONIC_XCVR_PID_QSFP_400G_CR4 = 9, /* Fiber */ IONIC_XCVR_PID_QSFP_100G_AOC = 50, IONIC_XCVR_PID_QSFP_100G_ACC = 51, @@ -1303,6 +1306,15 @@ enum ionic_xcvr_pid { IONIC_XCVR_PID_SFP_25GBASE_ACC = 71, IONIC_XCVR_PID_SFP_10GBASE_T = 72, IONIC_XCVR_PID_SFP_1000BASE_T = 73, + IONIC_XCVR_PID_QSFP_200G_AOC = 74, + IONIC_XCVR_PID_QSFP_200G_FR4 = 75, + IONIC_XCVR_PID_QSFP_200G_DR4 = 76, + IONIC_XCVR_PID_QSFP_200G_SR4 = 77, + IONIC_XCVR_PID_QSFP_200G_ACC = 78, + IONIC_XCVR_PID_QSFP_400G_FR4 = 79, + IONIC_XCVR_PID_QSFP_400G_DR4 = 80, + IONIC_XCVR_PID_QSFP_400G_SR4 = 81, + IONIC_XCVR_PID_QSFP_400G_VR4 = 82, }; /** @@ -1404,6 +1416,8 @@ struct ionic_xcvr_status { */ union ionic_port_config { struct { +#define IONIC_SPEED_400G 400000 /* 400G in Mbps */ +#define IONIC_SPEED_200G 200000 /* 200G in Mbps */ #define IONIC_SPEED_100G 100000 /* 100G in Mbps */ #define IONIC_SPEED_50G 50000 /* 50G in Mbps */ #define IONIC_SPEED_40G 40000 /* 40G in Mbps */ @@ -3209,7 +3223,11 @@ union ionic_adminq_comp { #define IONIC_BAR0_INTR_CTRL_OFFSET 0x2000 #define IONIC_DEV_CMD_DONE 0x00000001 -#define IONIC_ASIC_TYPE_CAPRI 0 +#define IONIC_ASIC_TYPE_NONE 0 +#define IONIC_ASIC_TYPE_CAPRI 1 +#define IONIC_ASIC_TYPE_ELBA 2 +#define IONIC_ASIC_TYPE_GIGLIO 3 +#define IONIC_ASIC_TYPE_SALINA 4 /** * struct ionic_doorbell - Doorbell register layout diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 3d3f936779f7..7707a9e53c43 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ 
b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3265,7 +3265,7 @@ int ionic_lif_alloc(struct ionic *ionic) lif->netdev->min_mtu = max_t(unsigned int, ETH_MIN_MTU, le32_to_cpu(lif->identity->eth.min_frame_size)); lif->netdev->max_mtu = - le32_to_cpu(lif->identity->eth.max_frame_size) - ETH_HLEN - VLAN_HLEN; + le32_to_cpu(lif->identity->eth.max_frame_size) - VLAN_ETH_HLEN; lif->neqs = ionic->neqs_per_lif; lif->nxqs = ionic->ntxqs_per_lif; @@ -3804,10 +3804,6 @@ err_out_adminq_deinit: return err; } -static void ionic_lif_notify_work(struct work_struct *ws) -{ -} - static void ionic_lif_set_netdev_info(struct ionic_lif *lif) { struct ionic_admin_ctx ctx = { @@ -3858,8 +3854,6 @@ int ionic_lif_register(struct ionic_lif *lif) ionic_lif_register_phc(lif); - INIT_WORK(&lif->ionic->nb_work, ionic_lif_notify_work); - lif->ionic->nb.notifier_call = ionic_lif_notify; err = register_netdevice_notifier(&lif->ionic->nb); @@ -3885,7 +3879,6 @@ void ionic_lif_unregister(struct ionic_lif *lif) { if (lif->ionic->nb.notifier_call) { unregister_netdevice_notifier(&lif->ionic->nb); - cancel_work_sync(&lif->ionic->nb_work); lif->ionic->nb.notifier_call = NULL; } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 0f817c3f92d8..daf1e82cb76b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -81,8 +81,9 @@ static int ionic_error_to_errno(enum ionic_status_code code) case IONIC_RC_EQTYPE: case IONIC_RC_EQID: case IONIC_RC_EINVAL: - case IONIC_RC_ENOSUPP: return -EINVAL; + case IONIC_RC_ENOSUPP: + return -EOPNOTSUPP; case IONIC_RC_EPERM: return -EPERM; case IONIC_RC_ENOENT: diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 9cff0a8ffb2c..3383ee1dad14 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -2832,7 +2832,7 @@ netxen_sysfs_validate_crb(struct netxen_adapter *adapter, static ssize_t netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2860,7 +2860,7 @@ netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, static ssize_t netxen_sysfs_write_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2901,7 +2901,7 @@ netxen_sysfs_validate_mem(struct netxen_adapter *adapter, static ssize_t netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2922,7 +2922,7 @@ netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, } static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2946,20 +2946,20 @@ static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, static const struct bin_attribute bin_attr_crb = { .attr = { .name = "crb", .mode = 0644 }, .size = 0, - .read = netxen_sysfs_read_crb, - .write = netxen_sysfs_write_crb, + .read_new = netxen_sysfs_read_crb, + 
.write_new = netxen_sysfs_write_crb, }; static const struct bin_attribute bin_attr_mem = { .attr = { .name = "mem", .mode = 0644 }, .size = 0, - .read = netxen_sysfs_read_mem, - .write = netxen_sysfs_write_mem, + .read_new = netxen_sysfs_read_mem, + .write_new = netxen_sysfs_write_mem, }; static ssize_t netxen_sysfs_read_dimm(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -3082,7 +3082,7 @@ out: static const struct bin_attribute bin_attr_dimm = { .attr = { .name = "dimm", .mode = 0644 }, .size = sizeof(struct netxen_dimm_cfg), - .read = netxen_sysfs_read_dimm, + .read_new = netxen_sysfs_read_dimm, }; diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index be4c9622618d..e0817f2a311a 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -23,7 +23,7 @@ enum mac_version { RTL_GIGA_MAC_VER_08, RTL_GIGA_MAC_VER_09, RTL_GIGA_MAC_VER_10, - RTL_GIGA_MAC_VER_11, + /* support for RTL_GIGA_MAC_VER_11 has been removed */ /* RTL_GIGA_MAC_VER_12 was handled the same as VER_17 */ /* RTL_GIGA_MAC_VER_13 was merged with VER_10 */ RTL_GIGA_MAC_VER_14, @@ -70,7 +70,8 @@ enum mac_version { RTL_GIGA_MAC_VER_63, RTL_GIGA_MAC_VER_64, RTL_GIGA_MAC_VER_65, - RTL_GIGA_MAC_VER_66, + RTL_GIGA_MAC_VER_70, + RTL_GIGA_MAC_VER_71, RTL_GIGA_MAC_NONE }; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 739707a7b40f..5724f650f9c6 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -57,6 +57,7 @@ #define FIRMWARE_8125A_3 "rtl_nic/rtl8125a-3.fw" #define FIRMWARE_8125B_2 "rtl_nic/rtl8125b-2.fw" #define FIRMWARE_8125D_1 "rtl_nic/rtl8125d-1.fw" +#define FIRMWARE_8125D_2 "rtl_nic/rtl8125d-2.fw" #define FIRMWARE_8126A_2 "rtl_nic/rtl8126a-2.fw" #define FIRMWARE_8126A_3 "rtl_nic/rtl8126a-3.fw" @@ -104,7 +105,6 @@ static const struct { [RTL_GIGA_MAC_VER_08] = {"RTL8102e" }, [RTL_GIGA_MAC_VER_09] = {"RTL8102e/RTL8103e" }, [RTL_GIGA_MAC_VER_10] = {"RTL8101e/RTL8100e" }, - [RTL_GIGA_MAC_VER_11] = {"RTL8168b/8111b" }, [RTL_GIGA_MAC_VER_14] = {"RTL8401" }, [RTL_GIGA_MAC_VER_17] = {"RTL8168b/8111b" }, [RTL_GIGA_MAC_VER_18] = {"RTL8168cp/8111cp" }, @@ -141,8 +141,9 @@ static const struct { /* reserve 62 for CFG_METHOD_4 in the vendor driver */ [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, - [RTL_GIGA_MAC_VER_65] = {"RTL8126A", FIRMWARE_8126A_2}, - [RTL_GIGA_MAC_VER_66] = {"RTL8126A", FIRMWARE_8126A_3}, + [RTL_GIGA_MAC_VER_65] = {"RTL8125D", FIRMWARE_8125D_2}, + [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, + [RTL_GIGA_MAC_VER_71] = {"RTL8126A", FIRMWARE_8126A_3}, }; static const struct pci_device_id rtl8169_pci_tbl[] = { @@ -623,7 +624,6 @@ struct rtl8169_tc_offsets { enum rtl_flag { RTL_FLAG_TASK_RESET_PENDING, - RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, RTL_FLAG_TASK_TX_TIMEOUT, RTL_FLAG_MAX }; @@ -708,6 +708,7 @@ MODULE_FIRMWARE(FIRMWARE_8107E_2); MODULE_FIRMWARE(FIRMWARE_8125A_3); MODULE_FIRMWARE(FIRMWARE_8125B_2); MODULE_FIRMWARE(FIRMWARE_8125D_1); +MODULE_FIRMWARE(FIRMWARE_8125D_2); MODULE_FIRMWARE(FIRMWARE_8126A_2); MODULE_FIRMWARE(FIRMWARE_8126A_3); @@ -1230,7 +1231,7 @@ static void rtl_writephy(struct rtl8169_private *tp, int location, int val) case RTL_GIGA_MAC_VER_31: r8168dp_2_mdio_write(tp, location, val); break; - case 
RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: r8168g_mdio_write(tp, location, val); break; default: @@ -1245,7 +1246,7 @@ static int rtl_readphy(struct rtl8169_private *tp, int location) case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: return r8168dp_2_mdio_read(tp, location); - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: return r8168g_mdio_read(tp, location); default: return r8169_mdio_read(tp, location); @@ -1576,7 +1577,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) break; case RTL_GIGA_MAC_VER_34: case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_71: r8169_mod_reg8_cond(tp, Config2, PME_SIGNAL, wolopts); break; default: @@ -2049,7 +2050,7 @@ static void rtl_set_eee_txidle_timer(struct rtl8169_private *tp) tp->tx_lpi_timer = timer_val; r8168_mac_ocp_write(tp, 0xe048, timer_val); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: tp->tx_lpi_timer = timer_val; RTL_W16(tp, EEE_TXIDLE_TIMER_8125, timer_val); break; @@ -2257,10 +2258,11 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) enum mac_version ver; } mac_info[] = { /* 8126A family. */ - { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_66 }, - { 0x7cf, 0x649, RTL_GIGA_MAC_VER_65 }, + { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71 }, + { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, /* 8125D family. */ + { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65 }, { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, /* 8125B family. */ @@ -2336,7 +2338,7 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) /* 8168B family. */ { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17 }, - /* This one is very old and rare, let's see if anybody complains. + /* This one is very old and rare, support has been removed. * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11 }, */ @@ -2528,7 +2530,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST | RX_PAUSE_SLOT_ON); break; @@ -2660,7 +2662,7 @@ static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_61: rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond_2, 100, 42); @@ -2903,7 +2905,7 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_38: rtl_eri_set_bits(tp, 0xd4, 0x0c00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xc0ac, 0, 0x1f80); break; default: @@ -2917,7 +2919,7 @@ static void rtl_disable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: rtl_eri_clear_bits(tp, 0xd4, 0x1f00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... 
RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xc0ac, 0x1f80, 0); break; default: @@ -2943,8 +2945,8 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) rtl_mod_config5(tp, 0, ASPM_en); switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) | INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -2955,7 +2957,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: /* reset ephy tx/rx disable timer */ r8168_mac_ocp_modify(tp, 0xe094, 0xff00, 0); /* chip can trigger L1.2 */ @@ -2967,7 +2969,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } else { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, 0); break; default: @@ -2975,8 +2977,8 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) & ~INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -3696,12 +3698,12 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) /* disable new tx descriptor format */ r8168_mac_ocp_modify(tp, 0xeb58, 0x0001, 0x0000); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) RTL_W8(tp, 0xD8, RTL_R8(tp, 0xD8) & ~0x02); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0400); else if (tp->mac_version == RTL_GIGA_MAC_VER_63) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0200); @@ -3719,8 +3721,8 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) r8168_mac_ocp_modify(tp, 0xe056, 0x00f0, 0x0030); r8168_mac_ocp_modify(tp, 0xe040, 0x1000, 0x0000); r8168_mac_ocp_modify(tp, 0xea1c, 0x0003, 0x0001); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) r8168_mac_ocp_modify(tp, 0xea1c, 0x0300, 0x0000); else r8168_mac_ocp_modify(tp, 0xea1c, 0x0004, 0x0000); @@ -3804,7 +3806,6 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_08] = rtl_hw_start_8102e_3, [RTL_GIGA_MAC_VER_09] = rtl_hw_start_8102e_2, [RTL_GIGA_MAC_VER_10] = NULL, - [RTL_GIGA_MAC_VER_11] = rtl_hw_start_8168b, [RTL_GIGA_MAC_VER_14] = rtl_hw_start_8401, [RTL_GIGA_MAC_VER_17] = rtl_hw_start_8168b, [RTL_GIGA_MAC_VER_18] = rtl_hw_start_8168cp_1, @@ -3840,8 +3841,9 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, [RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d, - [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8126a, - [RTL_GIGA_MAC_VER_66] = rtl_hw_start_8126a, + [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d, + [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, 
+ [RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a, }; if (hw_configs[tp->mac_version]) @@ -3858,12 +3860,13 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_61: case RTL_GIGA_MAC_VER_64: + case RTL_GIGA_MAC_VER_65: for (i = 0xa00; i < 0xb00; i += 4) RTL_W32(tp, i, 0); break; case RTL_GIGA_MAC_VER_63: - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: for (i = 0xa00; i < 0xa80; i += 4) RTL_W32(tp, i, 0); RTL_W16(tp, INT_CFG1_8125, 0x0000); @@ -4095,7 +4098,7 @@ static void rtl8169_cleanup(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: rtl_enable_rxdvgate(tp); fsleep(2000); break; @@ -4252,7 +4255,7 @@ static unsigned int rtl_quirk_packet_padto(struct rtl8169_private *tp, switch (tp->mac_version) { case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: padto = max_t(unsigned int, padto, ETH_ZLEN); break; default: @@ -4680,12 +4683,6 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) if (status & LinkChg) phy_mac_interrupt(tp->phydev); - if (unlikely(status & RxFIFOOver && - tp->mac_version == RTL_GIGA_MAC_VER_11)) { - netif_stop_queue(tp->dev); - rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); - } - rtl_irq_disable(tp); napi_schedule(&tp->napi); out: @@ -4723,8 +4720,6 @@ static void rtl_task(struct work_struct *work) reset: rtl_reset_work(tp); netif_wake_queue(tp->dev); - } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) { - rtl_reset_work(tp); } } @@ -5103,9 +5098,6 @@ static void rtl_set_irq_mask(struct rtl8169_private *tp) if (tp->mac_version <= RTL_GIGA_MAC_VER_06) tp->irq_mask |= SYSErr | RxFIFOOver; - else if (tp->mac_version == RTL_GIGA_MAC_VER_11) - /* special workaround needed */ - tp->irq_mask |= RxFIFOOver; } static int rtl_alloc_irq(struct rtl8169_private *tp) @@ -5281,7 +5273,7 @@ static void rtl_hw_initialize(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48: rtl_hw_init_8168g(tp); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: rtl_hw_init_8125(tp); break; default: @@ -5300,7 +5292,6 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_02 ... 
RTL_GIGA_MAC_VER_06: return JUMBO_7K; /* RTL8168b */ - case RTL_GIGA_MAC_VER_11: case RTL_GIGA_MAC_VER_17: return JUMBO_4K; /* RTL8168c */ @@ -5347,13 +5338,6 @@ static bool rtl_aspm_is_safe(struct rtl8169_private *tp) return false; } -static umode_t r8169_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - return 0444; -} - static int r8169_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { @@ -5370,7 +5354,7 @@ static int r8169_hwmon_read(struct device *dev, enum hwmon_sensor_types type, } static const struct hwmon_ops r8169_hwmon_ops = { - .is_visible = r8169_hwmon_is_visible, + .visible = 0444, .read = r8169_hwmon_read, }; diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 5307c6ff4e25..968c8a2185a4 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -276,15 +276,6 @@ static void rtl8169sce_hw_phy_config(struct rtl8169_private *tp, rtl_writephy_batch(phydev, phy_reg_init); } -static void rtl8168bb_hw_phy_config(struct rtl8169_private *tp, - struct phy_device *phydev) -{ - phy_write(phydev, 0x1f, 0x0001); - phy_set_bits(phydev, 0x16, BIT(0)); - phy_write(phydev, 0x10, 0xf41b); - phy_write(phydev, 0x1f, 0x0000); -} - static void rtl8168bef_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev) { @@ -1136,7 +1127,6 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_08] = rtl8102e_hw_phy_config, [RTL_GIGA_MAC_VER_09] = rtl8102e_hw_phy_config, [RTL_GIGA_MAC_VER_10] = NULL, - [RTL_GIGA_MAC_VER_11] = rtl8168bb_hw_phy_config, [RTL_GIGA_MAC_VER_14] = rtl8401_hw_phy_config, [RTL_GIGA_MAC_VER_17] = rtl8168bef_hw_phy_config, [RTL_GIGA_MAC_VER_18] = rtl8168cp_1_hw_phy_config, @@ -1172,8 +1162,9 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_61] = rtl8125a_2_hw_phy_config, [RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config, [RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config, - [RTL_GIGA_MAC_VER_65] = rtl8126a_hw_phy_config, - [RTL_GIGA_MAC_VER_66] = rtl8126a_hw_phy_config, + [RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config, + [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, + [RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config, }; if (phy_configs[ver]) diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index dbc3f92eebc4..2bbfcad613ab 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -13,6 +13,7 @@ #define RTASE_HW_VER_906X_7XA 0x00800000 #define RTASE_HW_VER_906X_7XC 0x04000000 #define RTASE_HW_VER_907XD_V1 0x04800000 +#define RTASE_HW_VER_907XD_VA 0x08000000 #define RTASE_RX_DMA_BURST_256 4 #define RTASE_TX_DMA_BURST_UNLIMITED 7 diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index de7f11232593..585d0b21c9e0 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1725,6 +1725,7 @@ static int rtase_get_settings(struct net_device *dev, cmd->base.speed = SPEED_5000; break; case RTASE_HW_VER_907XD_V1: + case RTASE_HW_VER_907XD_VA: cmd->base.speed = SPEED_10000; break; } @@ -1993,6 +1994,7 @@ static int rtase_check_mac_version_valid(struct rtase_private *tp) case RTASE_HW_VER_906X_7XA: case RTASE_HW_VER_906X_7XC: case RTASE_HW_VER_907XD_V1: + case 
RTASE_HW_VER_907XD_VA: ret = 0; break; } @@ -2016,7 +2018,7 @@ static int rtase_init_board(struct pci_dev *pdev, struct net_device **dev_out, SET_NETDEV_DEV(dev, &pdev->dev); ret = pci_enable_device(pdev); - if (ret < 0) + if (ret) goto err_out_free_dev; /* make sure PCI base addr 1 is MMIO */ @@ -2032,7 +2034,7 @@ static int rtase_init_board(struct pci_dev *pdev, struct net_device **dev_out, } ret = pci_request_regions(pdev, KBUILD_MODNAME); - if (ret < 0) + if (ret) goto err_out_disable; ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); @@ -2108,7 +2110,7 @@ static int rtase_init_one(struct pci_dev *pdev, dev_dbg(&pdev->dev, "Automotive Switch Ethernet driver loaded\n"); ret = rtase_init_board(pdev, &dev, &ioaddr); - if (ret != 0) + if (ret) return ret; tp = netdev_priv(dev); @@ -2118,7 +2120,7 @@ static int rtase_init_one(struct pci_dev *pdev, /* identify chip attached to board */ ret = rtase_check_mac_version_valid(tp); - if (ret != 0) { + if (ret) { dev_err(&pdev->dev, "unknown chip version: 0x%08x, contact rtase maintainers (see MAINTAINERS file)\n", tp->hw_ver); @@ -2129,7 +2131,7 @@ static int rtase_init_one(struct pci_dev *pdev, rtase_init_hardware(tp); ret = rtase_alloc_interrupt(pdev, tp); - if (ret < 0) { + if (ret) { dev_err(&pdev->dev, "unable to alloc MSIX/MSI\n"); goto err_out_del_napi; } @@ -2174,7 +2176,7 @@ static int rtase_init_one(struct pci_dev *pdev, netif_carrier_off(dev); ret = register_netdev(dev); - if (ret != 0) + if (ret) goto err_out_free_dma; netdev_dbg(dev, "%pM, IRQ %d\n", dev->dev_addr, dev->irq); diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 9ac6e2aad18f..84d09a8973b7 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -111,25 +111,35 @@ static void rswitch_top_init(struct rswitch_private *priv) /* Forwarding engine block (MFWD) */ static void rswitch_fwd_init(struct rswitch_private *priv) { + u32 all_ports_mask = GENMASK(RSWITCH_NUM_AGENTS - 1, 0); unsigned int i; - /* For ETHA */ - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { - iowrite32(FWPC0_DEFAULT, priv->addr + FWPC0(i)); + /* Start with empty configuration */ + for (i = 0; i < RSWITCH_NUM_AGENTS; i++) { + /* Disable all port features */ + iowrite32(0, priv->addr + FWPC0(i)); + /* Disallow L3 forwarding and direct descriptor forwarding */ + iowrite32(FIELD_PREP(FWCP1_LTHFW, all_ports_mask), + priv->addr + FWPC1(i)); + /* Disallow L2 forwarding */ + iowrite32(FIELD_PREP(FWCP2_LTWFW, all_ports_mask), + priv->addr + FWPC2(i)); + /* Disallow port-based forwarding */ + iowrite32(0, priv->addr + FWPBFC(i)); } - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + /* For enabled ETHA ports, set up port-based forwarding */ + rswitch_for_each_enabled_port(priv, i) { + /* Port-based forwarding from port i to GWCA port */ + rswitch_modify(priv->addr, FWPBFC(i), FWPBFC_PBDV, + FIELD_PREP(FWPBFC_PBDV, BIT(priv->gwca.index))); + /* Within GWCA port, forward to Rx queue for port i */ iowrite32(priv->rdev[i]->rx_queue->index, priv->addr + FWPBFCSDC(GWCA_INDEX, i)); - iowrite32(BIT(priv->gwca.index), priv->addr + FWPBFC(i)); } - /* For GWCA */ - iowrite32(FWPC0_DEFAULT, priv->addr + FWPC0(priv->gwca.index)); - iowrite32(FWPC1_DDE, priv->addr + FWPC1(priv->gwca.index)); - iowrite32(0, priv->addr + FWPBFC(priv->gwca.index)); - iowrite32(GENMASK(RSWITCH_NUM_PORTS - 1, 0), priv->addr + FWPBFC(priv->gwca.index)); + /* For GWCA port, allow direct descriptor forwarding */ + rswitch_modify(priv->addr, FWPC1(priv->gwca.index), 
FWPC1_DDE, FWPC1_DDE); } /* Gateway CPU agent block (GWCA) */ @@ -1159,9 +1169,9 @@ static void rswitch_rmac_setting(struct rswitch_etha *etha, const u8 *mac) static void rswitch_etha_enable_mii(struct rswitch_etha *etha) { - rswitch_modify(etha->addr, MPIC, MPIC_PSMCS_MASK | MPIC_PSMHT_MASK, - MPIC_PSMCS(etha->psmcs) | MPIC_PSMHT(0x06)); - rswitch_modify(etha->addr, MPSM, 0, MPSM_MFF_C45); + rswitch_modify(etha->addr, MPIC, MPIC_PSMCS | MPIC_PSMHT, + FIELD_PREP(MPIC_PSMCS, etha->psmcs) | + FIELD_PREP(MPIC_PSMHT, 0x06)); } static int rswitch_etha_hw_init(struct rswitch_etha *etha, const u8 *mac) @@ -1190,42 +1200,29 @@ static int rswitch_etha_hw_init(struct rswitch_etha *etha, const u8 *mac) return rswitch_etha_change_mode(etha, EAMC_OPC_OPERATION); } -static int rswitch_etha_set_access(struct rswitch_etha *etha, bool read, - int phyad, int devad, int regad, int data) +static int rswitch_etha_mpsm_op(struct rswitch_etha *etha, bool read, + unsigned int mmf, unsigned int pda, + unsigned int pra, unsigned int pop, + unsigned int prd) { - int pop = read ? MDIO_READ_C45 : MDIO_WRITE_C45; u32 val; int ret; - if (devad == 0xffffffff) - return -ENODEV; - - writel(MMIS1_CLEAR_FLAGS, etha->addr + MMIS1); + val = MPSM_PSME | + FIELD_PREP(MPSM_MFF, mmf) | + FIELD_PREP(MPSM_PDA, pda) | + FIELD_PREP(MPSM_PRA, pra) | + FIELD_PREP(MPSM_POP, pop) | + FIELD_PREP(MPSM_PRD, prd); + iowrite32(val, etha->addr + MPSM); - val = MPSM_PSME | MPSM_MFF_C45; - iowrite32((regad << 16) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PAACS, MMIS1_PAACS); + ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); if (ret) return ret; - rswitch_modify(etha->addr, MMIS1, MMIS1_PAACS, MMIS1_PAACS); - if (read) { - writel((pop << 13) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PRACS, MMIS1_PRACS); - if (ret) - return ret; - - ret = (ioread32(etha->addr + MPSM) & MPSM_PRD_MASK) >> 16; - - rswitch_modify(etha->addr, MMIS1, MMIS1_PRACS, MMIS1_PRACS); - } else { - iowrite32((data << 16) | (pop << 13) | (devad << 8) | (phyad << 3) | val, - etha->addr + MPSM); - - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PWACS, MMIS1_PWACS); + val = ioread32(etha->addr + MPSM); + ret = FIELD_GET(MPSM_PRD, val); } return ret; @@ -1235,16 +1232,47 @@ static int rswitch_etha_mii_read_c45(struct mii_bus *bus, int addr, int devad, int regad) { struct rswitch_etha *etha = bus->priv; + int ret; - return rswitch_etha_set_access(etha, true, addr, devad, regad, 0); + ret = rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_ADDRESS, regad); + if (ret) + return ret; + + return rswitch_etha_mpsm_op(etha, true, MPSM_MMF_C45, addr, devad, + MPSM_POP_READ_C45, 0); } static int rswitch_etha_mii_write_c45(struct mii_bus *bus, int addr, int devad, int regad, u16 val) { struct rswitch_etha *etha = bus->priv; + int ret; - return rswitch_etha_set_access(etha, false, addr, devad, regad, val); + ret = rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_ADDRESS, regad); + if (ret) + return ret; + + return rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_WRITE, val); +} + +static int rswitch_etha_mii_read_c22(struct mii_bus *bus, int phyad, int regad) +{ + struct rswitch_etha *etha = bus->priv; + + return rswitch_etha_mpsm_op(etha, true, MPSM_MMF_C22, phyad, regad, + MPSM_POP_READ_C22, 0); +} + +static int rswitch_etha_mii_write_c22(struct mii_bus *bus, int phyad, + int regad, u16 
val) +{ + struct rswitch_etha *etha = bus->priv; + + return rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C22, phyad, regad, + MPSM_POP_WRITE, val); } /* Call of_node_put(port) after done */ @@ -1329,6 +1357,8 @@ static int rswitch_mii_register(struct rswitch_device *rdev) mii_bus->priv = rdev->etha; mii_bus->read_c45 = rswitch_etha_mii_read_c45; mii_bus->write_c45 = rswitch_etha_mii_write_c45; + mii_bus->read = rswitch_etha_mii_read_c22; + mii_bus->write = rswitch_etha_mii_write_c22; mii_bus->parent = &rdev->priv->pdev->dev; mdio_np = of_get_child_by_name(rdev->np_port, "mdio"); @@ -1549,7 +1579,7 @@ static void rswitch_ether_port_deinit_all(struct rswitch_private *priv) { unsigned int i; - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + rswitch_for_each_enabled_port(priv, i) { phy_exit(priv->rdev[i]->serdes); rswitch_ether_port_deinit_one(priv->rdev[i]); } @@ -1924,9 +1954,6 @@ static int rswitch_device_alloc(struct rswitch_private *priv, unsigned int index if (err < 0) goto out_get_params; - if (rdev->priv->gwca.speed < rdev->etha->speed) - rdev->priv->gwca.speed = rdev->etha->speed; - err = rswitch_rxdmac_alloc(ndev); if (err < 0) goto out_rxdmac; diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index d8d4ed7d7f8b..532192cbca4b 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -12,6 +12,7 @@ #define RSWITCH_MAX_NUM_QUEUES 128 +#define RSWITCH_NUM_AGENTS 5 #define RSWITCH_NUM_PORTS 3 #define rswitch_for_each_enabled_port(priv, i) \ for (i = 0; i < RSWITCH_NUM_PORTS; i++) \ @@ -731,28 +732,21 @@ enum rswitch_etha_mode { #define MPIC_LSC_100M 1 #define MPIC_LSC_1G 2 #define MPIC_LSC_2_5G 3 - -#define MDIO_READ_C45 0x03 -#define MDIO_WRITE_C45 0x01 +#define MPIC_PSMCS GENMASK(22, 16) +#define MPIC_PSMHT GENMASK(26, 24) #define MPSM_PSME BIT(0) -#define MPSM_MFF_C45 BIT(2) -#define MPSM_PRD_SHIFT 16 -#define MPSM_PRD_MASK GENMASK(31, MPSM_PRD_SHIFT) - -/* Completion flags */ -#define MMIS1_PAACS BIT(2) /* Address */ -#define MMIS1_PWACS BIT(1) /* Write */ -#define MMIS1_PRACS BIT(0) /* Read */ -#define MMIS1_CLEAR_FLAGS 0xf - -#define MPIC_PSMCS_SHIFT 16 -#define MPIC_PSMCS_MASK GENMASK(22, MPIC_PSMCS_SHIFT) -#define MPIC_PSMCS(val) ((val) << MPIC_PSMCS_SHIFT) - -#define MPIC_PSMHT_SHIFT 24 -#define MPIC_PSMHT_MASK GENMASK(26, MPIC_PSMHT_SHIFT) -#define MPIC_PSMHT(val) ((val) << MPIC_PSMHT_SHIFT) +#define MPSM_MFF BIT(2) +#define MPSM_MMF_C22 0 +#define MPSM_MMF_C45 1 +#define MPSM_PDA GENMASK(7, 3) +#define MPSM_PRA GENMASK(12, 8) +#define MPSM_POP GENMASK(14, 13) +#define MPSM_POP_ADDRESS 0 +#define MPSM_POP_WRITE 1 +#define MPSM_POP_READ_C22 2 +#define MPSM_POP_READ_C45 3 +#define MPSM_PRD GENMASK(31, 16) #define MLVC_PLV BIT(16) @@ -806,6 +800,7 @@ enum rswitch_gwca_mode { #define CABPPFLC_INIT_VALUE 0x00800080 /* MFWD */ +#define FWPC0(i) (FWPC00 + (i) * 0x10) #define FWPC0_LTHTA BIT(0) #define FWPC0_IP4UE BIT(3) #define FWPC0_IP4TE BIT(4) @@ -819,15 +814,15 @@ enum rswitch_gwca_mode { #define FWPC0_MACHMA BIT(27) #define FWPC0_VLANSA BIT(28) -#define FWPC0(i) (FWPC00 + (i) * 0x10) -#define FWPC0_DEFAULT (FWPC0_LTHTA | FWPC0_IP4UE | FWPC0_IP4TE | \ - FWPC0_IP4OE | FWPC0_L2SE | FWPC0_IP4EA | \ - FWPC0_IPDSA | FWPC0_IPHLA | FWPC0_MACSDA | \ - FWPC0_MACHLA | FWPC0_MACHMA | FWPC0_VLANSA) #define FWPC1(i) (FWPC10 + (i) * 0x10) +#define FWCP1_LTHFW GENMASK(16 + (RSWITCH_NUM_AGENTS - 1), 16) #define FWPC1_DDE BIT(0) -#define FWPBFC(i) (FWPBFC0 + (i) * 0x10) +#define FWPC2(i) (FWPC20 + (i) * 0x10) +#define 
FWCP2_LTWFW GENMASK(16 + (RSWITCH_NUM_AGENTS - 1), 16) + +#define FWPBFC(i) (FWPBFC0 + (i) * 0x10) +#define FWPBFC_PBDV GENMASK(RSWITCH_NUM_AGENTS - 1, 0) #define FWPBFCSDC(j, i) (FWPBFCSDC00 + (i) * 0x10 + (j) * 0x04) @@ -984,7 +979,6 @@ struct rswitch_gwca { DECLARE_BITMAP(used, RSWITCH_MAX_NUM_QUEUES); u32 tx_irq_bits[RSWITCH_NUM_IRQ_REGS]; u32 rx_irq_bits[RSWITCH_NUM_IRQ_REGS]; - int speed; }; #define NUM_QUEUES_PER_NDEV 2 diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 6658536a4e17..4cc85a36a1ab 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -154,6 +154,18 @@ config DWMAC_RZN1 the stmmac device driver. This support can make use of a custom MII converter PCS device. +config DWMAC_S32 + tristate "NXP S32G/S32R GMAC support" + default ARCH_S32 + depends on OF && (ARCH_S32 || COMPILE_TEST) + help + Support for the Ethernet controller on NXP S32CC SoCs. + + This selects NXP SoC glue layer support for the stmmac + device driver. This driver is used for the GMAC Ethernet + controller on S32CC series SoCs, i.e. S32G2xx, S32G3xx and + S32R45. + config DWMAC_SOCFPGA tristate "SOCFPGA dwmac support" default ARCH_INTEL_SOCFPGA diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 2389fd261344..b26f0e79c2b3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_DWMAC_MESON) += dwmac-meson.o dwmac-meson8b.o obj-$(CONFIG_DWMAC_QCOM_ETHQOS) += dwmac-qcom-ethqos.o obj-$(CONFIG_DWMAC_ROCKCHIP) += dwmac-rk.o obj-$(CONFIG_DWMAC_RZN1) += dwmac-rzn1.o +obj-$(CONFIG_DWMAC_S32) += dwmac-s32.o obj-$(CONFIG_DWMAC_SOCFPGA) += dwmac-altr-socfpga.o obj-$(CONFIG_DWMAC_STARFIVE) += dwmac-starfive.o obj-$(CONFIG_DWMAC_STI) += dwmac-sti.o diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 1367fa5c9b8e..e25db747a81a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -257,6 +257,8 @@ struct stmmac_safety_stats { #define CSR_F_150M 150000000 #define CSR_F_250M 250000000 #define CSR_F_300M 300000000 +#define CSR_F_500M 500000000 +#define CSR_F_800M 800000000 #define MAC_CSR_H_FRQ_MASK 0x20 @@ -543,18 +545,8 @@ struct dma_features { #define STMMAC_VLAN_INSERT 0x2 #define STMMAC_VLAN_REPLACE 0x3 -extern const struct stmmac_desc_ops enh_desc_ops; -extern const struct stmmac_desc_ops ndesc_ops; - struct mac_device_info; -extern const struct stmmac_hwtimestamp stmmac_ptp; -extern const struct stmmac_hwtimestamp dwmac1000_ptp; -extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; - -extern const struct ptp_clock_info stmmac_ptp_clock_ops; -extern const struct ptp_clock_info dwmac1000_ptp_clock_ops; - struct mac_link { u32 caps; u32 speed_mask; @@ -641,8 +633,4 @@ void stmmac_dwmac4_set_mac(void __iomem *ioaddr, bool enable); void dwmac_dma_flush_tx_fifo(void __iomem *ioaddr); -extern const struct stmmac_mode_ops ring_mode_ops; -extern const struct stmmac_mode_ops chain_mode_ops; -extern const struct stmmac_desc_ops dwmac4_desc_ops; - #endif /* __COMMON_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 83290e707df5..bd4eb187f8c6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@
-181,24 +181,19 @@ static void dwc_qos_remove(struct platform_device *pdev) static void tegra_eqos_fix_speed(void *priv, unsigned int speed, unsigned int mode) { struct tegra_eqos *eqos = priv; - unsigned long rate = 125000000; bool needs_calibration = false; + long rate = 125000000; u32 value; int err; switch (speed) { case SPEED_1000: - needs_calibration = true; - rate = 125000000; - break; - case SPEED_100: needs_calibration = true; - rate = 25000000; - break; + fallthrough; case SPEED_10: - rate = 2500000; + rate = rgmii_clock(speed); break; default: diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 641f3cd019a3..43e0fbba4f77 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -186,7 +186,7 @@ static void imx_dwmac_fix_speed(void *priv, unsigned int speed, unsigned int mod { struct plat_stmmacenet_data *plat_dat; struct imx_priv_data *dwmac = priv; - unsigned long rate; + long rate; int err; plat_dat = dwmac->plat_dat; @@ -196,17 +196,8 @@ static void imx_dwmac_fix_speed(void *priv, unsigned int speed, unsigned int mod (plat_dat->mac_interface == PHY_INTERFACE_MODE_MII)) return; - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_10: - rate = 2500000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "invalid speed %u\n", speed); return; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index d94f0a150e93..ddee6154d40b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -31,27 +31,13 @@ struct intel_dwmac_data { static void kmb_eth_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) { struct intel_dwmac *dwmac = priv; - unsigned long rate; + long rate; int ret; - rate = clk_get_rate(dwmac->tx_clk); - - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - - case SPEED_100: - rate = 25000000; - break; - - case SPEED_10: - rate = 2500000; - break; - - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "Invalid speed\n"); - break; + return; } ret = clk_set_rate(dwmac->tx_clk, rate); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 901a3c1959fa..2a5b38723635 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -777,7 +777,7 @@ static void ethqos_ptp_clk_freq_config(struct stmmac_priv *priv) netdev_err(priv->dev, "Failed to max out clk_ptp_ref: %d\n", err); plat_dat->clk_ptp_rate = clk_get_rate(plat_dat->clk_ptp_ref); - netdev_dbg(priv->dev, "PTP rate %d\n", plat_dat->clk_ptp_rate); + netdev_dbg(priv->dev, "PTP rate %lu\n", plat_dat->clk_ptp_rate); } static int qcom_ethqos_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 8cb374668b74..a4dc89e23a68 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -1079,20 +1079,11 @@ static void rk3568_set_gmac_speed(struct rk_priv_data *bsp_priv, int speed) { struct clk *clk_mac_speed = bsp_priv->clks[RK_CLK_MAC_SPEED].clk; struct device *dev = &bsp_priv->pdev->dev; - unsigned long 
rate; + long rate; int ret; - switch (speed) { - case 10: - rate = 2500000; - break; - case 100: - rate = 25000000; - break; - case 1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dev, "unknown speed value for GMAC speed=%d", speed); return; } @@ -1540,20 +1531,11 @@ static void rv1126_set_rgmii_speed(struct rk_priv_data *bsp_priv, int speed) { struct clk *clk_mac_speed = bsp_priv->clks[RK_CLK_MAC_SPEED].clk; struct device *dev = &bsp_priv->pdev->dev; - unsigned long rate; + long rate; int ret; - switch (speed) { - case 10: - rate = 2500000; - break; - case 100: - rate = 25000000; - break; - case 1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dev, "unknown speed value for RGMII speed=%d", speed); return; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c new file mode 100644 index 000000000000..9cc0e5817416 --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NXP S32G/R GMAC glue layer + * + * Copyright 2019-2024 NXP + * + */ + +#include <linux/clk.h> +#include <linux/clk-provider.h> +#include <linux/device.h> +#include <linux/ethtool.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_mdio.h> +#include <linux/of_address.h> +#include <linux/phy.h> +#include <linux/phylink.h> +#include <linux/platform_device.h> +#include <linux/stmmac.h> + +#include "stmmac_platform.h" + +#define GMAC_INTF_RATE_125M 125000000 /* 125MHz */ + +/* SoC PHY interface control register */ +#define PHY_INTF_SEL_MII 0x00 +#define PHY_INTF_SEL_SGMII 0x01 +#define PHY_INTF_SEL_RGMII 0x02 +#define PHY_INTF_SEL_RMII 0x08 + +struct s32_priv_data { + void __iomem *ioaddr; + void __iomem *ctrl_sts; + struct device *dev; + phy_interface_t *intf_mode; + struct clk *tx_clk; + struct clk *rx_clk; +}; + +static int s32_gmac_write_phy_intf_select(struct s32_priv_data *gmac) +{ + writel(PHY_INTF_SEL_RGMII, gmac->ctrl_sts); + + dev_dbg(gmac->dev, "PHY mode set to %s\n", phy_modes(*gmac->intf_mode)); + + return 0; +} + +static int s32_gmac_init(struct platform_device *pdev, void *priv) +{ + struct s32_priv_data *gmac = priv; + int ret; + + /* Set initial TX interface clock */ + ret = clk_prepare_enable(gmac->tx_clk); + if (ret) { + dev_err(&pdev->dev, "Can't enable tx clock\n"); + return ret; + } + ret = clk_set_rate(gmac->tx_clk, GMAC_INTF_RATE_125M); + if (ret) { + dev_err(&pdev->dev, "Can't set tx clock\n"); + goto err_tx_disable; + } + + /* Set initial RX interface clock */ + ret = clk_prepare_enable(gmac->rx_clk); + if (ret) { + dev_err(&pdev->dev, "Can't enable rx clock\n"); + goto err_tx_disable; + } + ret = clk_set_rate(gmac->rx_clk, GMAC_INTF_RATE_125M); + if (ret) { + dev_err(&pdev->dev, "Can't set rx clock\n"); + goto err_txrx_disable; + } + + /* Set interface mode */ + ret = s32_gmac_write_phy_intf_select(gmac); + if (ret) { + dev_err(&pdev->dev, "Can't set PHY interface mode\n"); + goto err_txrx_disable; + } + + return 0; + +err_txrx_disable: + clk_disable_unprepare(gmac->rx_clk); +err_tx_disable: + clk_disable_unprepare(gmac->tx_clk); + return ret; +} + +static void s32_gmac_exit(struct platform_device *pdev, void *priv) +{ + struct s32_priv_data *gmac = priv; + + clk_disable_unprepare(gmac->tx_clk); + clk_disable_unprepare(gmac->rx_clk); +} + +static void s32_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) +{ + struct s32_priv_data *gmac 
= priv; + long tx_clk_rate; + int ret; + + tx_clk_rate = rgmii_clock(speed); + if (tx_clk_rate < 0) { + dev_err(gmac->dev, "Unsupported/Invalid speed: %d\n", speed); + return; + } + + dev_dbg(gmac->dev, "Set tx clock to %ld Hz\n", tx_clk_rate); + ret = clk_set_rate(gmac->tx_clk, tx_clk_rate); + if (ret) + dev_err(gmac->dev, "Can't set tx clock\n"); +} + +static int s32_dwmac_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat; + struct device *dev = &pdev->dev; + struct stmmac_resources res; + struct s32_priv_data *gmac; + int ret; + + gmac = devm_kzalloc(&pdev->dev, sizeof(*gmac), GFP_KERNEL); + if (!gmac) + return -ENOMEM; + + gmac->dev = &pdev->dev; + + ret = stmmac_get_platform_resources(pdev, &res); + if (ret) + return dev_err_probe(dev, ret, + "Failed to get platform resources\n"); + + plat = devm_stmmac_probe_config_dt(pdev, res.mac); + if (IS_ERR(plat)) + return dev_err_probe(dev, PTR_ERR(plat), + "dt configuration failed\n"); + + /* PHY interface mode control reg */ + gmac->ctrl_sts = devm_platform_get_and_ioremap_resource(pdev, 1, NULL); + if (IS_ERR(gmac->ctrl_sts)) + return dev_err_probe(dev, PTR_ERR(gmac->ctrl_sts), + "S32CC config region is missing\n"); + + /* tx clock */ + gmac->tx_clk = devm_clk_get(&pdev->dev, "tx"); + if (IS_ERR(gmac->tx_clk)) + return dev_err_probe(dev, PTR_ERR(gmac->tx_clk), + "tx clock not found\n"); + + /* rx clock */ + gmac->rx_clk = devm_clk_get(&pdev->dev, "rx"); + if (IS_ERR(gmac->rx_clk)) + return dev_err_probe(dev, PTR_ERR(gmac->rx_clk), + "rx clock not found\n"); + + gmac->intf_mode = &plat->phy_interface; + gmac->ioaddr = res.addr; + + /* S32CC core feature set */ + plat->has_gmac4 = true; + plat->pmt = 1; + plat->flags |= STMMAC_FLAG_SPH_DISABLE; + plat->rx_fifo_size = 20480; + plat->tx_fifo_size = 20480; + + plat->init = s32_gmac_init; + plat->exit = s32_gmac_exit; + plat->fix_mac_speed = s32_fix_mac_speed; + + plat->bsp_priv = gmac; + + return stmmac_pltfr_probe(pdev, plat, &res); +} + +static const struct of_device_id s32_dwmac_match[] = { + { .compatible = "nxp,s32g2-dwmac" }, + { } +}; +MODULE_DEVICE_TABLE(of, s32_dwmac_match); + +static struct platform_driver s32_dwmac_driver = { + .probe = s32_dwmac_probe, + .remove = stmmac_pltfr_remove, + .driver = { + .name = "s32-dwmac", + .pm = &stmmac_pltfr_pm_ops, + .of_match_table = s32_dwmac_match, + }, +}; +module_platform_driver(s32_dwmac_driver); + +MODULE_AUTHOR("Jan Petrous (OSS) <jan.petrous@oss.nxp.com>"); +MODULE_DESCRIPTION("NXP S32G/R common chassis GMAC driver"); +MODULE_LICENSE("GPL"); + diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c index 421666279dd3..0a0a363d3730 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c @@ -34,24 +34,13 @@ struct starfive_dwmac { static void starfive_dwmac_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) { struct starfive_dwmac *dwmac = priv; - unsigned long rate; + long rate; int err; - rate = clk_get_rate(dwmac->clk_tx); - - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_10: - rate = 2500000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "invalid speed %u\n", speed); - break; + return; } err = clk_set_rate(dwmac->clk_tx, rate); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index 
a6ff02d905a9..eabc4da9e1a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -21,10 +21,7 @@ #include "stmmac_platform.h" -#define DWMAC_125MHZ 125000000 #define DWMAC_50MHZ 50000000 -#define DWMAC_25MHZ 25000000 -#define DWMAC_2_5MHZ 2500000 #define IS_PHY_IF_MODE_RGMII(iface) (iface == PHY_INTERFACE_MODE_RGMII || \ iface == PHY_INTERFACE_MODE_RGMII_ID || \ @@ -140,7 +137,7 @@ static void stih4xx_fix_retime_src(void *priv, u32 spd, unsigned int mode) struct sti_dwmac *dwmac = priv; u32 src = dwmac->tx_retime_src; u32 reg = dwmac->ctrl_reg; - u32 freq = 0; + long freq = 0; if (dwmac->interface == PHY_INTERFACE_MODE_MII) { src = TX_RETIME_SRC_TXCLK; @@ -153,19 +150,14 @@ static void stih4xx_fix_retime_src(void *priv, u32 spd, unsigned int mode) } } else if (IS_PHY_IF_MODE_RGMII(dwmac->interface)) { /* On GiGa clk source can be either ext or from clkgen */ - if (spd == SPEED_1000) { - freq = DWMAC_125MHZ; - } else { + freq = rgmii_clock(spd); + + if (spd != SPEED_1000 && freq > 0) /* Switch to clkgen for these speeds */ src = TX_RETIME_SRC_CLKGEN; - if (spd == SPEED_100) - freq = DWMAC_25MHZ; - else if (spd == SPEED_10) - freq = DWMAC_2_5MHZ; - } } - if (src == TX_RETIME_SRC_CLKGEN && freq) + if (src == TX_RETIME_SRC_CLKGEN && freq > 0) clk_set_rate(dwmac->clk, freq); regmap_update_bits(dwmac->regmap, reg, STIH4XX_RETIME_SRC_MASK, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index c25781874aa7..c36f90a782c5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -27,7 +27,7 @@ static void dwmac4_core_init(struct mac_device_info *hw, struct stmmac_priv *priv = netdev_priv(dev); void __iomem *ioaddr = hw->pcsr; u32 value = readl(ioaddr + GMAC_CONFIG); - u32 clk_rate; + unsigned long clk_rate; value |= GMAC_CORE_INIT; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h index 1ce6f43d545a..806555976496 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h @@ -144,4 +144,7 @@ /* TDS3 use for both format (read and write back) */ #define RDES3_OWN BIT(31) +extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; +extern const struct stmmac_desc_ops dwmac4_desc_ops; + #endif /* __DWMAC4_DESCS_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index a04a79003692..20027d3c25a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -493,4 +493,9 @@ #define XGMAC_RDES3_TSD BIT(6) #define XGMAC_RDES3_TSA BIT(4) +extern const struct stmmac_ops dwxgmac210_ops; +extern const struct stmmac_ops dwxlgmac2_ops; +extern const struct stmmac_dma_ops dwxgmac210_dma_ops; +extern const struct stmmac_desc_ops dwxgmac210_desc_ops; + #endif /* __STMMAC_DWXGMAC2_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index a72d336a8350..31bdbab9a46c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -9,6 +9,8 @@ #include "stmmac_fpe.h" #include "stmmac_ptp.h" #include "stmmac_est.h" +#include "dwmac4_descs.h" +#include "dwxgmac2.h" static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg) { @@ -265,7 +267,7 @@ static const struct stmmac_hwif_entry { 
.hwtimestamp = &stmmac_ptp, .ptp = &stmmac_ptp_clock_ops, .mode = NULL, - .tc = &dwxgmac_tc_ops, + .tc = &dwmac510_tc_ops, .mmc = &dwxgmac_mmc_ops, .est = &dwmac510_est_ops, .setup = dwxgmac2_setup, @@ -288,7 +290,7 @@ static const struct stmmac_hwif_entry { .hwtimestamp = &stmmac_ptp, .ptp = &stmmac_ptp_clock_ops, .mode = NULL, - .tc = &dwxgmac_tc_ops, + .tc = &dwmac510_tc_ops, .mmc = &dwxgmac_mmc_ops, .est = &dwmac510_est_ops, .setup = dwxlgmac2_setup, diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 64f8ed67dcc4..2f7295b6c1c5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -665,6 +665,15 @@ struct stmmac_regs_off { u32 est_off; }; +extern const struct stmmac_desc_ops enh_desc_ops; +extern const struct stmmac_desc_ops ndesc_ops; + +extern const struct stmmac_hwtimestamp stmmac_ptp; +extern const struct stmmac_hwtimestamp dwmac1000_ptp; + +extern const struct stmmac_mode_ops ring_mode_ops; +extern const struct stmmac_mode_ops chain_mode_ops; + extern const struct stmmac_ops dwmac100_ops; extern const struct stmmac_dma_ops dwmac100_dma_ops; extern const struct stmmac_ops dwmac1000_ops; @@ -676,14 +685,6 @@ extern const struct stmmac_dma_ops dwmac410_dma_ops; extern const struct stmmac_ops dwmac510_ops; extern const struct stmmac_tc_ops dwmac4_tc_ops; extern const struct stmmac_tc_ops dwmac510_tc_ops; -extern const struct stmmac_tc_ops dwxgmac_tc_ops; -extern const struct stmmac_ops dwxgmac210_ops; -extern const struct stmmac_ops dwxlgmac2_ops; -extern const struct stmmac_dma_ops dwxgmac210_dma_ops; -extern const struct stmmac_desc_ops dwxgmac210_desc_ops; -extern const struct stmmac_mmc_ops dwmac_mmc_ops; -extern const struct stmmac_mmc_ops dwxgmac_mmc_ops; -extern const struct stmmac_est_ops dwmac510_est_ops; #define GMAC_VERSION 0x00000020 /* GMAC CORE Version */ #define GMAC4_VERSION 0x00000110 /* GMAC4+ CORE Version */ diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h index 5d1ea3e07459..1cba39fb2c44 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc.h +++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h @@ -139,4 +139,7 @@ struct stmmac_counters { unsigned int mmc_rx_fpe_fragment_cntr; }; +extern const struct stmmac_mmc_ops dwmac_mmc_ops; +extern const struct stmmac_mmc_ops dwxgmac_mmc_ops; + #endif /* __MMC_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h index 7a858c566e7e..d247fa383a6e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h @@ -62,3 +62,5 @@ #define EST_SRWO BIT(0) #define EST_GCL_DATA 0x00000034 + +extern const struct stmmac_est_ops dwmac510_est_ops; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c81ea8cdfe6e..16b8bcfa8b11 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -301,7 +301,7 @@ static void stmmac_global_err(struct stmmac_priv *priv) */ static void stmmac_clk_csr_set(struct stmmac_priv *priv) { - u32 clk_rate; + unsigned long clk_rate; clk_rate = clk_get_rate(priv->plat->stmmac_clk); @@ -325,6 +325,10 @@ static void stmmac_clk_csr_set(struct stmmac_priv *priv) priv->clk_csr = STMMAC_CSR_150_250M; else if ((clk_rate >= CSR_F_250M) && (clk_rate <= CSR_F_300M)) priv->clk_csr = STMMAC_CSR_250_300M; + else if ((clk_rate >= 
CSR_F_300M) && (clk_rate < CSR_F_500M)) + priv->clk_csr = STMMAC_CSR_300_500M; + else if ((clk_rate >= CSR_F_500M) && (clk_rate < CSR_F_800M)) + priv->clk_csr = STMMAC_CSR_500_800M; } if (priv->plat->flags & STMMAC_FLAG_HAS_SUN8I) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e492..06e07e6e180b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -640,7 +640,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_info(&pdev->dev, "PTP uses main clock\n"); } else { plat->clk_ptp_rate = clk_get_rate(plat->clk_ptp_ref); - dev_dbg(&pdev->dev, "PTP rate %d\n", plat->clk_ptp_rate); + dev_dbg(&pdev->dev, "PTP rate %lu\n", plat->clk_ptp_rate); } plat->stmmac_rst = devm_reset_control_get_optional(&pdev->dev, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h index 4cc70480ce0f..3fe0e3a80e80 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h @@ -104,4 +104,7 @@ int dwmac1000_ptp_enable(struct ptp_clock_info *ptp, void dwmac1000_get_ptptime(void __iomem *ptpaddr, u64 *ptp_time); void dwmac1000_timestamp_interrupt(struct stmmac_priv *priv); +extern const struct ptp_clock_info stmmac_ptp_clock_ops; +extern const struct ptp_clock_info dwmac1000_ptp_clock_ops; + #endif /* __STMMAC_PTP_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 6a79e6a111ed..694d6ee14381 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -1284,14 +1284,3 @@ const struct stmmac_tc_ops dwmac510_tc_ops = { .query_caps = tc_query_caps, .setup_mqprio = tc_setup_dwmac510_mqprio, }; - -const struct stmmac_tc_ops dwxgmac_tc_ops = { - .init = tc_init, - .setup_cls_u32 = tc_setup_cls_u32, - .setup_cbs = tc_setup_cbs, - .setup_cls = tc_setup_cls, - .setup_taprio = tc_setup_taprio, - .setup_etf = tc_setup_etf, - .query_caps = tc_query_caps, - .setup_mqprio = tc_setup_dwmac510_mqprio, -}; diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index df6d35d41b97..d7459866d24c 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -3303,7 +3303,7 @@ static struct page *niu_find_rxpage(struct rx_ring_info *rp, u64 addr, addr &= PAGE_MASK; pp = &rp->rxhash[h]; for (; (p = *pp) != NULL; pp = &niu_next_page(p)) { - if (p->index == addr) { + if (p->private == addr) { *link = pp; goto found; } @@ -3318,7 +3318,7 @@ static void niu_hash_page(struct rx_ring_info *rp, struct page *page, u64 base) { unsigned int h = niu_hash_rxaddr(rp, base); - page->index = base; + page->private = base; niu_next_page(page) = rp->rxhash[h]; rp->rxhash[h] = page; } @@ -3400,11 +3400,11 @@ static int niu_rx_pkt_ignore(struct niu *np, struct rx_ring_info *rp) rcr_size = rp->rbr_sizes[(val & RCR_ENTRY_PKTBUFSZ) >> RCR_ENTRY_PKTBUFSZ_SHIFT]; - if ((page->index + PAGE_SIZE) - rcr_size == addr) { + if ((page->private + PAGE_SIZE) - rcr_size == addr) { *link = niu_next_page(page); - np->ops->unmap_page(np->device, page->index, + np->ops->unmap_page(np->device, page->private, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; __free_page(page); rp->rbr_refill_pending++; @@ -3469,11 +3469,11 @@ static int niu_process_rx_pkt(struct napi_struct 
*napi, struct niu *np, append_size = append_size - skb->len; niu_rx_skb_append(skb, page, off, append_size, rcr_size); - if ((page->index + rp->rbr_block_size) - rcr_size == addr) { + if ((page->private + rp->rbr_block_size) - rcr_size == addr) { *link = niu_next_page(page); - np->ops->unmap_page(np->device, page->index, + np->ops->unmap_page(np->device, page->private, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; rp->rbr_refill_pending++; } else @@ -3538,11 +3538,11 @@ static void niu_rbr_free(struct niu *np, struct rx_ring_info *rp) page = rp->rxhash[i]; while (page) { struct page *next = niu_next_page(page); - u64 base = page->index; + u64 base = page->private; np->ops->unmap_page(np->device, base, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; __free_page(page); @@ -6460,7 +6460,7 @@ static void niu_reset_buffers(struct niu *np) page = rp->rxhash[j]; while (page) { struct page *next = niu_next_page(page); - u64 base = page->index; + u64 base = page->private; base = base >> RBR_DESCR_ADDR_SHIFT; rp->rbr[k++] = cpu_to_le32(base); page = next; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index bc658bc60885..642155cb8315 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -235,7 +235,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } /* Update tunnel dst according to Geneve options. */ @@ -322,7 +322,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, len = skb->len; err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) - dev_sw_netstats_rx_add(geneve->dev, len); + dev_dstats_rx_add(geneve->dev, len); return; drop: @@ -387,14 +387,14 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely((!geneve->cfg.inner_proto_inherit && inner_proto != htons(ETH_P_TEB)))) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } @@ -1023,7 +1023,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); dev_kfree_skb(skb); - DEV_STATS_INC(dev, tx_dropped); + dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } } else { @@ -1202,7 +1202,7 @@ static void geneve_setup(struct net_device *dev) dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ dev->min_mtu = ETH_MIN_MTU; /* The max_mtu calculation does not take account of GENEVE diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index d2b3f5a59141..e3dcdeacc12c 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -177,8 +177,7 @@ static struct mctp_i2c_client *mctp_i2c_new_client(struct i2c_client *client) return mcli; err: if (mcli) { - if (mcli->client) - i2c_unregister_device(mcli->client); + i2c_unregister_device(mcli->client); kfree(mcli); } return 
ERR_PTR(rc); diff --git a/drivers/net/mdio/mdio-octeon.c b/drivers/net/mdio/mdio-octeon.c index 2beb83154d39..cb53dccbde1a 100644 --- a/drivers/net/mdio/mdio-octeon.c +++ b/drivers/net/mdio/mdio-octeon.c @@ -17,37 +17,20 @@ static int octeon_mdiobus_probe(struct platform_device *pdev) { struct cavium_mdiobus *bus; struct mii_bus *mii_bus; - struct resource *res_mem; - resource_size_t mdio_phys; - resource_size_t regsize; union cvmx_smix_en smi_en; - int err = -ENOENT; + int err; mii_bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*bus)); if (!mii_bus) return -ENOMEM; - res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res_mem == NULL) { - dev_err(&pdev->dev, "found no memory resource\n"); - return -ENXIO; - } - bus = mii_bus->priv; bus->mii_bus = mii_bus; - mdio_phys = res_mem->start; - regsize = resource_size(res_mem); - if (!devm_request_mem_region(&pdev->dev, mdio_phys, regsize, - res_mem->name)) { - dev_err(&pdev->dev, "request_mem_region failed\n"); - return -ENXIO; - } - - bus->register_base = devm_ioremap(&pdev->dev, mdio_phys, regsize); - if (!bus->register_base) { + bus->register_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(bus->register_base)) { dev_err(&pdev->dev, "dev_ioremap failed\n"); - return -ENOMEM; + return PTR_ERR(bus->register_base); } smi_en.u64 = 0; diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 4ea44a2f48f7..f422a2f666ef 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -36,6 +36,7 @@ #include <linux/inet.h> #include <linux/configfs.h> #include <linux/etherdevice.h> +#include <linux/u64_stats_sync.h> #include <linux/utsname.h> #include <linux/rtnetlink.h> @@ -90,6 +91,12 @@ static DEFINE_MUTEX(target_cleanup_list_lock); */ static struct console netconsole_ext; +struct netconsole_target_stats { + u64_stats_t xmit_drop_count; + u64_stats_t enomem_count; + struct u64_stats_sync syncp; +}; + /** * struct netconsole_target - Represents a configured netconsole target. * @list: Links this target into the target_list. @@ -97,6 +104,7 @@ static struct console netconsole_ext; * @userdata_group: Links to the userdata configfs hierarchy * @userdata_complete: Cached, formatted string of append * @userdata_length: String length of userdata_complete + * @stats: Packet send stats for the target. Used for debugging. * @enabled: On / off knob to enable / disable target. * Visible from userspace (read-write). 
* We maintain a strict 1:1 correspondence between this and @@ -124,6 +132,7 @@ struct netconsole_target { char userdata_complete[MAX_USERDATA_ENTRY_LENGTH * MAX_USERDATA_ITEMS]; size_t userdata_length; #endif + struct netconsole_target_stats stats; bool enabled; bool extended; bool release; @@ -262,6 +271,7 @@ static void netconsole_process_cleanups_core(void) * | remote_ip * | local_mac * | remote_mac + * | transmit_errors * | userdata/ * | <key>/ * | value @@ -371,6 +381,21 @@ static ssize_t remote_mac_show(struct config_item *item, char *buf) return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac); } +static ssize_t transmit_errors_show(struct config_item *item, char *buf) +{ + struct netconsole_target *nt = to_target(item); + u64 xmit_drop_count, enomem_count; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&nt->stats.syncp); + xmit_drop_count = u64_stats_read(&nt->stats.xmit_drop_count); + enomem_count = u64_stats_read(&nt->stats.enomem_count); + } while (u64_stats_fetch_retry(&nt->stats.syncp, start)); + + return sysfs_emit(buf, "%llu\n", xmit_drop_count + enomem_count); +} + /* * This one is special -- targets created through the configfs interface * are not enabled (and the corresponding netpoll activated) by default. @@ -842,6 +867,7 @@ CONFIGFS_ATTR(, remote_ip); CONFIGFS_ATTR_RO(, local_mac); CONFIGFS_ATTR(, remote_mac); CONFIGFS_ATTR(, release); +CONFIGFS_ATTR_RO(, transmit_errors); static struct configfs_attribute *netconsole_target_attrs[] = { &attr_enabled, @@ -854,6 +880,7 @@ static struct configfs_attribute *netconsole_target_attrs[] = { &attr_remote_ip, &attr_local_mac, &attr_remote_mac, + &attr_transmit_errors, NULL, }; @@ -1058,6 +1085,33 @@ static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; +/** + * send_udp - Wrapper for netpoll_send_udp that counts errors + * @nt: target to send message to + * @msg: message to send + * @len: length of message + * + * Calls netpoll_send_udp and classifies the return value. If an error + * occurred it increments statistics in nt->stats accordingly. + * Errors are only counted when CONFIG_NETCONSOLE_DYNAMIC is enabled.
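+ * The counters are only exposed through the per-target "transmit_errors" + * configfs attribute, which exists only when CONFIG_NETCONSOLE_DYNAMIC is + * enabled, hence the conditional update in the code below.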
+ */ +static void send_udp(struct netconsole_target *nt, const char *msg, int len) +{ + int result = netpoll_send_udp(&nt->np, msg, len); + + if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) { + if (result == NET_XMIT_DROP) { + u64_stats_update_begin(&nt->stats.syncp); + u64_stats_inc(&nt->stats.xmit_drop_count); + u64_stats_update_end(&nt->stats.syncp); + } else if (result == -ENOMEM) { + u64_stats_update_begin(&nt->stats.syncp); + u64_stats_inc(&nt->stats.enomem_count); + u64_stats_update_end(&nt->stats.syncp); + } + } +} + static void send_msg_no_fragmentation(struct netconsole_target *nt, const char *msg, int msg_len, @@ -1085,7 +1139,7 @@ static void send_msg_no_fragmentation(struct netconsole_target *nt, MAX_PRINT_CHUNK - msg_len, "%s", userdata); - netpoll_send_udp(&nt->np, buf, msg_len); + send_udp(nt, buf, msg_len); } static void append_release(char *buf) @@ -1178,7 +1232,7 @@ static void send_fragmented_body(struct netconsole_target *nt, char *buf, this_offset += this_chunk; } - netpoll_send_udp(&nt->np, buf, this_header + this_offset); + send_udp(nt, buf, this_header + this_offset); offset += this_offset; } } @@ -1288,7 +1342,7 @@ static void write_msg(struct console *con, const char *msg, unsigned int len) tmp = msg; for (left = len; left;) { frag = min(left, MAX_PRINT_CHUNK); - netpoll_send_udp(&nt->np, tmp, frag); + send_udp(nt, tmp, frag); tmp += frag; left -= frag; } diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c index b79aedad855b..767a8c0714ac 100644 --- a/drivers/net/pcs/pcs-lynx.c +++ b/drivers/net/pcs/pcs-lynx.c @@ -35,6 +35,27 @@ enum sgmii_speed { #define phylink_pcs_to_lynx(pl_pcs) container_of((pl_pcs), struct lynx_pcs, pcs) #define lynx_to_phylink_pcs(lynx) (&(lynx)->pcs) +static unsigned int lynx_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + case PHY_INTERFACE_MODE_10GBASER: + case PHY_INTERFACE_MODE_2500BASEX: + return LINK_INBAND_DISABLE; + + case PHY_INTERFACE_MODE_USXGMII: + return LINK_INBAND_ENABLE; + + default: + return 0; + } +} + static void lynx_pcs_get_state_usxgmii(struct mdio_device *pcs, struct phylink_link_state *state) { @@ -306,6 +327,7 @@ static void lynx_pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, } static const struct phylink_pcs_ops lynx_pcs_phylink_ops = { + .pcs_inband_caps = lynx_pcs_inband_caps, .pcs_get_state = lynx_pcs_get_state, .pcs_config = lynx_pcs_config, .pcs_an_restart = lynx_pcs_an_restart, diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index 4f63abe638c4..7de804535229 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -88,6 +88,21 @@ static struct mtk_pcs_lynxi *pcs_to_mtk_pcs_lynxi(struct phylink_pcs *pcs) return container_of(pcs, struct mtk_pcs_lynxi, pcs); } +static unsigned int mtk_pcs_lynxi_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + default: + return 0; + } +} + static void mtk_pcs_lynxi_get_state(struct phylink_pcs *pcs, struct phylink_link_state *state) { @@ -241,6 +256,7 @@ static void mtk_pcs_lynxi_disable(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops 
mtk_pcs_lynxi_ops = { + .pcs_inband_caps = mtk_pcs_lynxi_inband_caps, .pcs_get_state = mtk_pcs_lynxi_get_state, .pcs_config = mtk_pcs_lynxi_config, .pcs_an_restart = mtk_pcs_lynxi_restart_an, diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 7246a910728d..f70ca39f0905 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -567,6 +567,33 @@ static int xpcs_validate(struct phylink_pcs *pcs, unsigned long *supported, return 0; } +static unsigned int xpcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs); + const struct dw_xpcs_compat *compat; + + compat = xpcs_find_compat(xpcs, interface); + if (!compat) + return 0; + + switch (compat->an_mode) { + case DW_AN_C73: + return LINK_INBAND_ENABLE; + + case DW_AN_C37_SGMII: + case DW_AN_C37_1000BASEX: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + case DW_10GBASER: + case DW_2500BASEX: + return LINK_INBAND_DISABLE; + + default: + return 0; + } +} + void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) { const struct dw_xpcs_compat *compat; @@ -1306,6 +1333,7 @@ static const struct dw_xpcs_desc xpcs_desc_list[] = { static const struct phylink_pcs_ops xpcs_phylink_ops = { .pcs_validate = xpcs_validate, + .pcs_inband_caps = xpcs_inband_caps, .pcs_pre_config = xpcs_pre_config, .pcs_config = xpcs_config, .pcs_get_state = xpcs_get_state, diff --git a/drivers/net/phy/bcm84881.c b/drivers/net/phy/bcm84881.c index 97da3aee4942..47405bded677 100644 --- a/drivers/net/phy/bcm84881.c +++ b/drivers/net/phy/bcm84881.c @@ -235,11 +235,21 @@ static int bcm84881_read_status(struct phy_device *phydev) return genphy_c45_read_mdix(phydev); } +/* The Broadcom BCM84881 in the Methode DM7052 is unable to provide an SGMII + * or 802.3z control word, so inband will not work.
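+ * Consequently this driver reports LINK_INBAND_DISABLE for every interface + * mode, telling phylink that in-band signalling cannot be used.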
+ */ +static unsigned int bcm84881_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + return LINK_INBAND_DISABLE; +} + static struct phy_driver bcm84881_drivers[] = { { .phy_id = 0xae025150, .phy_id_mask = 0xfffffff0, .name = "Broadcom BCM84881", + .inband_caps = bcm84881_inband_caps, .config_init = bcm84881_config_init, .probe = bcm84881_probe, .get_features = bcm84881_get_features, diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index cf8b6d0bfaa9..334c17a68edd 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -22,8 +22,6 @@ #define DP83826C_PHY_ID 0x2000a130 #define DP83826NC_PHY_ID 0x2000a110 -#define DP83822_DEVADDR 0x1f - #define MII_DP83822_CTRL_2 0x0a #define MII_DP83822_PHYSTS 0x10 #define MII_DP83822_PHYSCR 0x11 @@ -32,6 +30,7 @@ #define MII_DP83822_FCSCR 0x14 #define MII_DP83822_RCSR 0x17 #define MII_DP83822_RESET_CTRL 0x1f +#define MII_DP83822_IOCTRL2 0x463 #define MII_DP83822_GENCFG 0x465 #define MII_DP83822_SOR1 0x467 @@ -106,6 +105,18 @@ #define DP83822_RX_CLK_SHIFT BIT(12) #define DP83822_TX_CLK_SHIFT BIT(11) +/* IOCTRL2 bits */ +#define DP83822_IOCTRL2_GPIO2_CLK_SRC GENMASK(6, 4) +#define DP83822_IOCTRL2_GPIO2_CTRL GENMASK(2, 0) +#define DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF GENMASK(1, 0) + +#define DP83822_CLK_SRC_MAC_IF 0x0 +#define DP83822_CLK_SRC_XI 0x1 +#define DP83822_CLK_SRC_INT_REF 0x2 +#define DP83822_CLK_SRC_RMII_MASTER_MODE_REF 0x4 +#define DP83822_CLK_SRC_FREE_RUNNING 0x6 +#define DP83822_CLK_SRC_RECOVERED 0x7 + /* SOR1 mode */ #define DP83822_STRAP_MODE1 0 #define DP83822_STRAP_MODE2 BIT(0) @@ -141,6 +152,8 @@ struct dp83822_private { u8 cfg_dac_minus; u8 cfg_dac_plus; struct ethtool_wolinfo wol; + bool set_gpio2_clk_out; + u32 gpio2_clk_out; }; static int dp83822_config_wol(struct phy_device *phydev, @@ -159,14 +172,14 @@ static int dp83822_config_wol(struct phy_device *phydev, /* MAC addresses start with byte 5, but stored in mac[0]. 
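* (e.g. 00:11:22:33:44:55 is written as WOL_DA1 = 0x1100, WOL_DA2 = 0x3322, * WOL_DA3 = 0x5544, matching the writes below)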
* 822 PHYs store bytes 4|5, 2|3, 0|1 */ - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA1, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA1, (mac[1] << 8) | mac[0]); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA2, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA2, (mac[3] << 8) | mac[2]); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA3, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA3, (mac[5] << 8) | mac[4]); - value = phy_read_mmd(phydev, DP83822_DEVADDR, + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (wol->wolopts & WAKE_MAGIC) value |= DP83822_WOL_MAGIC_EN; @@ -174,13 +187,13 @@ static int dp83822_config_wol(struct phy_device *phydev, value &= ~DP83822_WOL_MAGIC_EN; if (wol->wolopts & WAKE_MAGICSECURE) { - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP1, (wol->sopass[1] << 8) | wol->sopass[0]); - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP2, (wol->sopass[3] << 8) | wol->sopass[2]); - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP3, (wol->sopass[5] << 8) | wol->sopass[4]); value |= DP83822_WOL_SECURE_ON; @@ -194,10 +207,10 @@ static int dp83822_config_wol(struct phy_device *phydev, value |= DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL | DP83822_WOL_CLR_INDICATION; - return phy_write_mmd(phydev, DP83822_DEVADDR, + return phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, value); } else { - return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, DP83822_WOL_EN | DP83822_WOL_MAGIC_EN | @@ -226,23 +239,23 @@ static void dp83822_get_wol(struct phy_device *phydev, wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE); wol->wolopts = 0; - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (value & DP83822_WOL_MAGIC_EN) wol->wolopts |= WAKE_MAGIC; if (value & DP83822_WOL_SECURE_ON) { - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP1); wol->sopass[0] = (sopass_val & 0xff); wol->sopass[1] = (sopass_val >> 8); - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP2); wol->sopass[2] = (sopass_val & 0xff); wol->sopass[3] = (sopass_val >> 8); - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP3); wol->sopass[4] = (sopass_val & 0xff); wol->sopass[5] = (sopass_val >> 8); @@ -415,6 +428,15 @@ static int dp83822_config_init(struct phy_device *phydev) int err = 0; int bmcr; + if (dp83822->set_gpio2_clk_out) + phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL2, + DP83822_IOCTRL2_GPIO2_CTRL | + DP83822_IOCTRL2_GPIO2_CLK_SRC, + FIELD_PREP(DP83822_IOCTRL2_GPIO2_CTRL, + DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF) | + FIELD_PREP(DP83822_IOCTRL2_GPIO2_CLK_SRC, + dp83822->gpio2_clk_out)); + if (phy_interface_is_rgmii(phydev)) { rx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0, true); @@ -430,18 +452,18 @@ static int dp83822_config_init(struct phy_device *phydev) if (tx_int_delay <= 0) rgmii_delay |= DP83822_TX_CLK_SHIFT; - err = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + err = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RX_CLK_SHIFT | DP83822_TX_CLK_SHIFT, 
rgmii_delay); if (err) return err; - err = phy_set_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RGMII_MODE_EN); if (err) return err; } else { - err = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RGMII_MODE_EN); if (err) @@ -496,7 +518,7 @@ static int dp83822_config_init(struct phy_device *phydev) return err; if (dp83822->fx_signal_det_low) { - err = phy_set_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_GENCFG, DP83822_SIG_DET_LOW); if (err) @@ -514,10 +536,10 @@ static int dp8382x_config_rmii_mode(struct phy_device *phydev) if (!device_property_read_string(dev, "ti,rmii-mode", &of_val)) { if (strcmp(of_val, "master") == 0) { - ret = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_SEL); } else if (strcmp(of_val, "slave") == 0) { - ret = phy_set_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_SEL); } else { phydev_err(phydev, "Invalid value for ti,rmii-mode property (%s)\n", @@ -539,7 +561,7 @@ static int dp83826_config_init(struct phy_device *phydev) int ret; if (phydev->interface == PHY_INTERFACE_MODE_RMII) { - ret = phy_set_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_EN); if (ret) return ret; @@ -548,7 +570,7 @@ static int dp83826_config_init(struct phy_device *phydev) if (ret) return ret; } else { - ret = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_EN); if (ret) return ret; @@ -560,7 +582,7 @@ static int dp83826_config_init(struct phy_device *phydev) FIELD_GET(DP83826_CFG_DAC_MINUS_MDIX_5_TO_4, dp83822->cfg_dac_minus)); mask = DP83826_VOD_CFG1_MINUS_MDIX_MASK | DP83826_VOD_CFG1_MINUS_MDI_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG1, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG1, mask, val); if (ret) return ret; @@ -568,7 +590,7 @@ static int dp83826_config_init(struct phy_device *phydev) FIELD_GET(DP83826_CFG_DAC_MINUS_MDIX_3_TO_0, dp83822->cfg_dac_minus)); mask = DP83826_VOD_CFG2_MINUS_MDIX_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG2, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG2, mask, val); if (ret) return ret; } @@ -577,7 +599,7 @@ static int dp83826_config_init(struct phy_device *phydev) val = FIELD_PREP(DP83826_VOD_CFG2_PLUS_MDIX_MASK, dp83822->cfg_dac_plus) | FIELD_PREP(DP83826_VOD_CFG2_PLUS_MDI_MASK, dp83822->cfg_dac_plus); mask = DP83826_VOD_CFG2_PLUS_MDIX_MASK | DP83826_VOD_CFG2_PLUS_MDI_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG2, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG2, mask, val); if (ret) return ret; } @@ -613,6 +635,7 @@ static int dp83822_of_init(struct phy_device *phydev) { struct dp83822_private *dp83822 = phydev->priv; struct device *dev = &phydev->mdio.dev; + const char *of_val; /* Signal detection for the PHY is only enabled if the FX_EN and the * SD_EN pins are strapped. 
Signal detection can only be enabled if FX_EN @@ -625,6 +648,29 @@ static int dp83822_of_init(struct phy_device *phydev) dp83822->fx_enabled = device_property_present(dev, "ti,fiber-mode"); + if (!device_property_read_string(dev, "ti,gpio2-clk-out", &of_val)) { + if (strcmp(of_val, "mac-if") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_MAC_IF; + } else if (strcmp(of_val, "xi") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_XI; + } else if (strcmp(of_val, "int-ref") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_INT_REF; + } else if (strcmp(of_val, "rmii-master-mode-ref") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_RMII_MASTER_MODE_REF; + } else if (strcmp(of_val, "free-running") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_FREE_RUNNING; + } else if (strcmp(of_val, "recovered") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_RECOVERED; + } else { + phydev_err(phydev, + "Invalid value for ti,gpio2-clk-out property (%s)\n", + of_val); + return -EINVAL; + } + + dp83822->set_gpio2_clk_out = true; + } + return 0; } @@ -673,7 +719,7 @@ static int dp83822_read_straps(struct phy_device *phydev) int fx_enabled, fx_sd_enable; int val; - val = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_SOR1); + val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_SOR1); if (val < 0) return val; @@ -748,7 +794,7 @@ static int dp83822_suspend(struct phy_device *phydev) { int value; - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (!(value & DP83822_WOL_EN)) genphy_suspend(phydev); @@ -762,9 +808,9 @@ static int dp83822_resume(struct phy_device *phydev) genphy_resume(phydev); - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, value | + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, value | DP83822_WOL_CLR_INDICATION); return 0; diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index cd50cd6a7f75..ffe223ad9e5f 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -717,6 +717,48 @@ static int marvell_config_aneg_fiber(struct phy_device *phydev) return genphy_check_and_restart_aneg(phydev, changed); } +static unsigned int m88e1111_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + /* In 1000base-X and SGMII modes, the inband mode can be changed + * through the Fibre page BMCR ANENABLE bit.
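+ * Bypass mode is selected through the serial autoneg bypass bit in the + * extended status register; see m88e1111_config_inband() below.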
+ */ + if (interface == PHY_INTERFACE_MODE_1000BASEX || + interface == PHY_INTERFACE_MODE_SGMII) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE | + LINK_INBAND_BYPASS; + + return 0; +} + +static int m88e1111_config_inband(struct phy_device *phydev, unsigned int modes) +{ + u16 extsr, bmcr; + int err; + + if (phydev->interface != PHY_INTERFACE_MODE_1000BASEX && + phydev->interface != PHY_INTERFACE_MODE_SGMII) + return -EINVAL; + + if (modes == LINK_INBAND_BYPASS) + extsr = MII_M1111_HWCFG_SERIAL_AN_BYPASS; + else + extsr = 0; + + if (modes == LINK_INBAND_DISABLE) + bmcr = 0; + else + bmcr = BMCR_ANENABLE; + + err = phy_modify(phydev, MII_M1111_PHY_EXT_SR, + MII_M1111_HWCFG_SERIAL_AN_BYPASS, extsr); + if (err < 0) + return err; + + return phy_modify_paged(phydev, MII_MARVELL_FIBER_PAGE, MII_BMCR, + BMCR_ANENABLE, bmcr); +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int extsr = phy_read(phydev, MII_M1111_PHY_EXT_SR); @@ -1508,7 +1550,6 @@ static int m88e1540_get_fld(struct phy_device *phydev, u8 *msecs) static int m88e1540_set_fld(struct phy_device *phydev, const u8 *msecs) { - struct ethtool_keee eee; int val, ret; if (*msecs == ETHTOOL_PHY_FAST_LINK_DOWN_OFF) @@ -1518,8 +1559,7 @@ static int m88e1540_set_fld(struct phy_device *phydev, const u8 *msecs) /* According to the Marvell data sheet EEE must be disabled for * Fast Link Down detection to work properly */ - ret = genphy_c45_ethtool_get_eee(phydev, &eee); - if (!ret && eee.eee_enabled) { + if (phydev->eee_cfg.eee_enabled) { phydev_warn(phydev, "Fast Link Down detection requires EEE to be disabled!\n"); return -EBUSY; } @@ -3677,6 +3717,8 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1112", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1112_config_init, .config_aneg = marvell_config_aneg, .config_intr = marvell_config_intr, @@ -3698,6 +3740,8 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, @@ -3721,6 +3765,8 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1111 (Finisar)", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 944ae98ad110..0dac08e85304 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -1469,18 +1469,17 @@ EXPORT_SYMBOL_GPL(genphy_c45_plca_get_status); * @phydev: target phy_device struct * @adv: variable to store advertised linkmodes * @lp: variable to store LP advertised linkmodes - * @is_enabled: variable to store EEE enabled/disabled configuration value * * Description: this function will read local and link partner PHY * advertisements, compare them and return the current EEE state.
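* * Returns: 1 if EEE is active, 0 if it is not, or a negative error code.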
*/ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp, bool *is_enabled) + unsigned long *lp) { __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp_adv) = {}; __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp_lp) = {}; __ETHTOOL_DECLARE_LINK_MODE_MASK(common); - bool eee_enabled, eee_active; + bool eee_active; int ret; ret = genphy_c45_read_eee_adv(phydev, tmp_adv); @@ -1491,9 +1490,8 @@ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, if (ret) return ret; - eee_enabled = !linkmode_empty(tmp_adv); linkmode_and(common, tmp_adv, tmp_lp); - if (eee_enabled && !linkmode_empty(common)) + if (!linkmode_empty(tmp_adv) && !linkmode_empty(common)) eee_active = phy_check_valid(phydev->speed, phydev->duplex, common); else @@ -1503,8 +1501,6 @@ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, linkmode_copy(adv, tmp_adv); if (lp) linkmode_copy(lp, tmp_lp); - if (is_enabled) - *is_enabled = eee_enabled; return eee_active; } @@ -1521,15 +1517,13 @@ EXPORT_SYMBOL(genphy_c45_eee_is_active); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data) { - bool is_enabled; int ret; ret = genphy_c45_eee_is_active(phydev, data->advertised, - data->lp_advertised, &is_enabled); + data->lp_advertised); if (ret < 0) return ret; - data->eee_enabled = is_enabled; data->eee_active = phydev->eee_active; linkmode_copy(data->supported, phydev->supported_eee); diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 0d20b534122b..e4b04cdaa995 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -988,8 +988,7 @@ static int phy_check_link_status(struct phy_device *phydev) if (phydev->link && phydev->state != PHY_RUNNING) { phy_check_downshift(phydev); phydev->state = PHY_RUNNING; - err = genphy_c45_eee_is_active(phydev, - NULL, NULL, NULL); + err = genphy_c45_eee_is_active(phydev, NULL, NULL); phydev->eee_active = err > 0; phydev->enable_tx_lpi = phydev->eee_cfg.tx_lpi_enabled && phydev->eee_active; @@ -1006,6 +1005,59 @@ static int phy_check_link_status(struct phy_device *phydev) } /** + * phy_inband_caps - query which in-band signalling modes are supported + * @phydev: a pointer to a &struct phy_device + * @interface: the interface mode for the PHY + * + * Returns zero if it is unknown what in-band signalling is supported by the + * PHY (e.g. because the PHY driver doesn't implement the method.) Otherwise, + * returns a bit mask of the LINK_INBAND_* values from + * &enum link_inband_signalling to describe which inband modes are supported + * by the PHY for this interface mode. + */ +unsigned int phy_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + if (phydev->drv && phydev->drv->inband_caps) + return phydev->drv->inband_caps(phydev, interface); + + return 0; +} +EXPORT_SYMBOL_GPL(phy_inband_caps); + +/** + * phy_config_inband - configure the desired PHY in-band mode + * @phydev: the phy_device struct + * @modes: in-band modes to configure + * + * Description: disables, enables or enables-with-bypass in-band signalling + * between the PHY and host system. + * + * Returns: zero on success, or negative errno value. 
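+ * Exactly one of LINK_INBAND_DISABLE, LINK_INBAND_ENABLE or + * LINK_INBAND_BYPASS must be set in @modes, otherwise -EINVAL is returned. + * A hypothetical caller (not part of this patch) might pair it with + * phy_inband_caps(): + * + *	if (phy_inband_caps(phydev, interface) & LINK_INBAND_ENABLE) + *		err = phy_config_inband(phydev, LINK_INBAND_ENABLE);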
+ */ +int phy_config_inband(struct phy_device *phydev, unsigned int modes) +{ + int err; + + if (!!(modes & LINK_INBAND_DISABLE) + + !!(modes & LINK_INBAND_ENABLE) + + !!(modes & LINK_INBAND_BYPASS) != 1) + return -EINVAL; + + mutex_lock(&phydev->lock); + if (!phydev->drv) + err = -EIO; + else if (!phydev->drv->config_inband) + err = -EOPNOTSUPP; + else + err = phydev->drv->config_inband(phydev, modes); + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_config_inband); + +/** * _phy_start_aneg - start auto-negotiation for this PHY device * @phydev: the phy_device struct * @@ -1605,7 +1657,7 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable) if (!phydev->drv) return -EIO; - ret = genphy_c45_eee_is_active(phydev, NULL, NULL, NULL); + ret = genphy_c45_eee_is_active(phydev, NULL, NULL); if (ret < 0) return ret; if (!ret) @@ -1649,8 +1701,8 @@ EXPORT_SYMBOL(phy_get_eee_err); * @phydev: target phy_device struct * @data: ethtool_keee data * - * Description: reports the Supported/Advertisement/LP Advertisement - * capabilities, etc. + * Description: get the current EEE settings, filling in all members of + * @data. */ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b26bb33cd1d4..1a908af4175b 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -32,6 +32,7 @@ #include <linux/phy_link_topology.h> #include <linux/pse-pd/pse.h> #include <linux/property.h> +#include <linux/ptp_clock_kernel.h> #include <linux/rtnetlink.h> #include <linux/sfp.h> #include <linux/skbuff.h> @@ -1998,6 +1999,15 @@ void phy_detach(struct phy_device *phydev) phy_suspend(phydev); if (dev) { + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + /* Disable timestamp if it is the one selected */ + if (hwprov && hwprov->phydev == phydev) { + rcu_assign_pointer(dev->hwprov, NULL); + kfree_rcu(hwprov, rcu_head); + } + phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; phy_link_topo_del_phy(dev, phydev); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 30a654e98352..6d50c2fdb190 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -56,7 +56,8 @@ struct phylink { struct phy_device *phydev; phy_interface_t link_interface; /* PHY_INTERFACE_xxx */ u8 cfg_link_an_mode; /* MLO_AN_xxx */ - u8 cur_link_an_mode; + u8 req_link_an_mode; /* Requested MLO_AN_xxx mode */ + u8 act_link_an_mode; /* Active MLO_AN_xxx mode */ u8 link_port; /* The current non-phy ethtool port */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); @@ -74,6 +75,7 @@ struct phylink { struct mutex state_mutex; struct phylink_link_state phy_state; + unsigned int phy_ib_mode; struct work_struct resolve; unsigned int pcs_neg_mode; unsigned int pcs_state; @@ -174,6 +176,24 @@ static const char *phylink_an_mode_str(unsigned int mode) return mode < ARRAY_SIZE(modestr) ? 
modestr[mode] : "unknown";
 }
 
+static const char *phylink_pcs_mode_str(unsigned int mode)
+{
+	if (!mode)
+		return "none";
+
+	if (mode & PHYLINK_PCS_NEG_OUTBAND)
+		return "outband";
+
+	if (mode & PHYLINK_PCS_NEG_INBAND) {
+		if (mode & PHYLINK_PCS_NEG_ENABLED)
+			return "inband,an-enabled";
+		else
+			return "inband,an-disabled";
+	}
+
+	return "unknown";
+}
+
 static unsigned int phylink_interface_signal_rate(phy_interface_t interface)
 {
 	switch (interface) {
@@ -971,6 +991,15 @@ static void phylink_resolve_an_pause(struct phylink_link_state *state)
 	}
 }
 
+static unsigned int phylink_pcs_inband_caps(struct phylink_pcs *pcs,
+					    phy_interface_t interface)
+{
+	if (pcs && pcs->ops->pcs_inband_caps)
+		return pcs->ops->pcs_inband_caps(pcs, interface);
+
+	return 0;
+}
+
 static void phylink_pcs_pre_config(struct phylink_pcs *pcs,
 				   phy_interface_t interface)
 {
@@ -1024,6 +1053,24 @@ static void phylink_pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 	pcs->ops->pcs_link_up(pcs, neg_mode, interface, speed, duplex);
 }
 
+/* Query inband for a specific interface mode, asking the MAC for the
+ * PCS which will be used to handle the interface mode.
+ */
+static unsigned int phylink_inband_caps(struct phylink *pl,
+					phy_interface_t interface)
+{
+	struct phylink_pcs *pcs;
+
+	if (!pl->mac_ops->mac_select_pcs)
+		return 0;
+
+	pcs = pl->mac_ops->mac_select_pcs(pl->config, interface);
+	if (!pcs)
+		return 0;
+
+	return phylink_pcs_inband_caps(pcs, interface);
+}
+
 static void phylink_pcs_poll_stop(struct phylink *pl)
 {
 	if (pl->cfg_link_an_mode == MLO_AN_INBAND)
@@ -1065,13 +1112,13 @@ static void phylink_mac_config(struct phylink *pl,
 
 	phylink_dbg(pl,
 		    "%s: mode=%s/%s/%s adv=%*pb pause=%02x\n",
-		    __func__, phylink_an_mode_str(pl->cur_link_an_mode),
+		    __func__, phylink_an_mode_str(pl->act_link_an_mode),
 		    phy_modes(st.interface),
 		    phy_rate_matching_to_str(st.rate_matching),
 		    __ETHTOOL_LINK_MODE_MASK_NBITS, st.advertising,
 		    st.pause);
 
-	pl->mac_ops->mac_config(pl->config, pl->cur_link_an_mode, &st);
+	pl->mac_ops->mac_config(pl->config, pl->act_link_an_mode, &st);
 }
 
 static void phylink_pcs_an_restart(struct phylink *pl)
@@ -1079,13 +1126,14 @@ static void phylink_pcs_an_restart(struct phylink *pl)
 	if (pl->pcs && linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
 					 pl->link_config.advertising) &&
 	    phy_interface_mode_is_8023z(pl->link_config.interface) &&
-	    phylink_autoneg_inband(pl->cur_link_an_mode))
+	    phylink_autoneg_inband(pl->act_link_an_mode))
 		pl->pcs->ops->pcs_an_restart(pl->pcs);
 }
 
 /**
  * phylink_pcs_neg_mode() - helper to determine PCS inband mode
- * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
+ * @pl: a pointer to a &struct phylink returned from phylink_create()
+ * @pcs: a pointer to &struct phylink_pcs
  * @interface: interface mode to be used
  * @advertising: advertisement ethtool link mode mask
  *
@@ -1102,11 +1150,21 @@ static void phylink_pcs_an_restart(struct phylink *pl)
 * Note: this is for cases where the PCS itself is involved in negotiation
 * (e.g. Clause 37, SGMII and similar) not Clause 73.
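+ *
+ * For example, when in-band mode is requested for SGMII and the PHY
+ * reports LINK_INBAND_ENABLE (and the PCS reports no conflicting
+ * capabilities), the result is PHYLINK_PCS_NEG_INBAND_ENABLED; for
+ * 1000base-X without a PHY, a PCS reporting only LINK_INBAND_DISABLE
+ * results in PHYLINK_PCS_NEG_INBAND_DISABLED.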
*/ -static unsigned int phylink_pcs_neg_mode(unsigned int mode, - phy_interface_t interface, - const unsigned long *advertising) +static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, + phy_interface_t interface, + const unsigned long *advertising) { - unsigned int neg_mode; + unsigned int pcs_ib_caps = 0; + unsigned int phy_ib_caps = 0; + unsigned int neg_mode, mode; + enum { + INBAND_CISCO_SGMII, + INBAND_BASEX, + } type; + + mode = pl->req_link_an_mode; + + pl->phy_ib_mode = 0; switch (interface) { case PHY_INTERFACE_MODE_SGMII: @@ -1119,10 +1177,7 @@ static unsigned int phylink_pcs_neg_mode(unsigned int mode, * inband communication. Note: there exist PHYs that run * with SGMII but do not send the inband data. */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; + type = INBAND_CISCO_SGMII; break; case PHY_INTERFACE_MODE_1000BASEX: @@ -1133,21 +1188,143 @@ static unsigned int phylink_pcs_neg_mode(unsigned int mode, * as well, but drivers may not support this, so may * need to override this. */ - if (!phylink_autoneg_inband(mode)) + type = INBAND_BASEX; + break; + + default: + pl->pcs_neg_mode = PHYLINK_PCS_NEG_NONE; + pl->act_link_an_mode = mode; + return; + } + + if (pcs) + pcs_ib_caps = phylink_pcs_inband_caps(pcs, interface); + + if (pl->phydev) + phy_ib_caps = phy_inband_caps(pl->phydev, interface); + + phylink_dbg(pl, "interface %s inband modes: pcs=%02x phy=%02x\n", + phy_modes(interface), pcs_ib_caps, phy_ib_caps); + + if (!phylink_autoneg_inband(mode)) { + bool pcs_ib_only = false; + bool phy_ib_only = false; + + if (pcs_ib_caps && pcs_ib_caps != LINK_INBAND_DISABLE) { + /* PCS supports reporting in-band capabilities, and + * supports more than disable mode. + */ + if (pcs_ib_caps & LINK_INBAND_DISABLE) + neg_mode = PHYLINK_PCS_NEG_OUTBAND; + else if (pcs_ib_caps & LINK_INBAND_ENABLE) + pcs_ib_only = true; + } + + if (phy_ib_caps && phy_ib_caps != LINK_INBAND_DISABLE) { + /* PHY supports in-band capabilities, and supports + * more than disable mode. + */ + if (phy_ib_caps & LINK_INBAND_DISABLE) + pl->phy_ib_mode = LINK_INBAND_DISABLE; + else if (phy_ib_caps & LINK_INBAND_BYPASS) + pl->phy_ib_mode = LINK_INBAND_BYPASS; + else if (phy_ib_caps & LINK_INBAND_ENABLE) + phy_ib_only = true; + } + + /* If either the PCS or PHY requires inband to be enabled, + * this is an invalid configuration. Provide a diagnostic + * message for this case, but don't try to force the issue. + */ + if (pcs_ib_only || phy_ib_only) + phylink_warn(pl, + "firmware wants %s mode, but %s%s%s requires inband\n", + phylink_an_mode_str(mode), + pcs_ib_only ? "PCS" : "", + pcs_ib_only && phy_ib_only ? " and " : "", + phy_ib_only ? "PHY" : ""); + + neg_mode = PHYLINK_PCS_NEG_OUTBAND; + } else if (type == INBAND_CISCO_SGMII || pl->phydev) { + /* For SGMII modes which are designed to be used with PHYs, or + * Base-X with a PHY, we try to use in-band mode where-ever + * possible. However, there are some PHYs e.g. BCM84881 which + * do not support in-band. 
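+		 * A PHY like that would report LINK_INBAND_DISABLE only,
+		 * which the table below resolves to out-of-band operation
+		 * with the mode switched to MLO_AN_PHY.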
+		 */
+		const unsigned int inband_ok = LINK_INBAND_ENABLE |
+					       LINK_INBAND_BYPASS;
+		const unsigned int outband_ok = LINK_INBAND_DISABLE |
+						LINK_INBAND_BYPASS;
+		/* PCS	PHY
+		 * D E	D E
+		 * 0 0	0 0	no information			inband enabled
+		 * 1 0	0 0	pcs doesn't support		outband
+		 * 0 1	0 0	pcs required			inband enabled
+		 * 1 1	0 0	pcs optional			inband enabled
+		 * 0 0	1 0	phy doesn't support		outband
+		 * 1 0	1 0	pcs+phy doesn't support		outband
+		 * 0 1	1 0	pcs required, phy doesn't support, invalid
+		 * 1 1	1 0	pcs optional, phy doesn't support, outband
+		 * 0 0	0 1	phy required			inband enabled
+		 * 1 0	0 1	pcs doesn't support, phy required, invalid
+		 * 0 1	0 1	pcs+phy required		inband enabled
+		 * 1 1	0 1	pcs optional, phy required	inband enabled
+		 * 0 0	1 1	phy optional			inband enabled
+		 * 1 0	1 1	pcs doesn't support, phy optional, outband
+		 * 0 1	1 1	pcs required, phy optional	inband enabled
+		 * 1 1	1 1	pcs+phy optional		inband enabled
+		 */
+		if ((!pcs_ib_caps || pcs_ib_caps & inband_ok) &&
+		    (!phy_ib_caps || phy_ib_caps & inband_ok)) {
+			/* In-band supported or unknown at both ends. Enable
+			 * in-band mode with or without bypass at the PHY.
+			 */
+			if (phy_ib_caps & LINK_INBAND_ENABLE)
+				pl->phy_ib_mode = LINK_INBAND_ENABLE;
+			else if (phy_ib_caps & LINK_INBAND_BYPASS)
+				pl->phy_ib_mode = LINK_INBAND_BYPASS;
+
+			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
+		} else if ((!pcs_ib_caps || pcs_ib_caps & outband_ok) &&
+			   (!phy_ib_caps || phy_ib_caps & outband_ok)) {
+			/* In-band is not supported at at least one end;
+			 * in-band bypass at the other end is possible.
+			 */
+			if (phy_ib_caps & LINK_INBAND_DISABLE)
+				pl->phy_ib_mode = LINK_INBAND_DISABLE;
+			else if (phy_ib_caps & LINK_INBAND_BYPASS)
+				pl->phy_ib_mode = LINK_INBAND_BYPASS;
+
+			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
+			if (pl->phydev)
+				mode = MLO_AN_PHY;
+		} else {
+			/* invalid */
+			phylink_warn(pl, "%s: incompatible in-band capabilities, trying in-band\n",
+				     phy_modes(interface));
+			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
+		}
+	} else {
+		/* For Base-X without a PHY */
+		if (pcs_ib_caps == LINK_INBAND_DISABLE)
+			/* If the PCS doesn't support inband, then inband must
+			 * be disabled.
+			 */
+			neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED;
+		else if (pcs_ib_caps == LINK_INBAND_ENABLE)
+			/* If the PCS requires inband, then inband must always
+			 * be enabled.
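+			 * This is independent of the Autoneg bit in the
+			 * advertisement.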
+ */ + neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, advertising)) neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; else neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; - break; - - default: - neg_mode = PHYLINK_PCS_NEG_NONE; - break; } - return neg_mode; + pl->pcs_neg_mode = neg_mode; + pl->act_link_an_mode = mode; } static void phylink_major_config(struct phylink *pl, bool restart, @@ -1159,11 +1336,9 @@ static void phylink_major_config(struct phylink *pl, bool restart, unsigned int neg_mode; int err; - phylink_dbg(pl, "major config %s\n", phy_modes(state->interface)); - - pl->pcs_neg_mode = phylink_pcs_neg_mode(pl->cur_link_an_mode, - state->interface, - state->advertising); + phylink_dbg(pl, "major config, requested %s/%s\n", + phylink_an_mode_str(pl->req_link_an_mode), + phy_modes(state->interface)); if (pl->mac_ops->mac_select_pcs) { pcs = pl->mac_ops->mac_select_pcs(pl->config, state->interface); @@ -1177,10 +1352,17 @@ static void phylink_major_config(struct phylink *pl, bool restart, pcs_changed = pl->pcs != pcs; } + phylink_pcs_neg_mode(pl, pcs, state->interface, state->advertising); + + phylink_dbg(pl, "major config, active %s/%s/%s\n", + phylink_an_mode_str(pl->act_link_an_mode), + phylink_pcs_mode_str(pl->pcs_neg_mode), + phy_modes(state->interface)); + phylink_pcs_poll_stop(pl); if (pl->mac_ops->mac_prepare) { - err = pl->mac_ops->mac_prepare(pl->config, pl->cur_link_an_mode, + err = pl->mac_ops->mac_prepare(pl->config, pl->act_link_an_mode, state->interface); if (err < 0) { phylink_err(pl, "mac_prepare failed: %pe\n", @@ -1214,7 +1396,7 @@ static void phylink_major_config(struct phylink *pl, bool restart, if (pl->pcs_state == PCS_STATE_STARTING || pcs_changed) phylink_pcs_enable(pl->pcs); - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs && pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; @@ -1230,13 +1412,20 @@ static void phylink_major_config(struct phylink *pl, bool restart, phylink_pcs_an_restart(pl); if (pl->mac_ops->mac_finish) { - err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode, + err = pl->mac_ops->mac_finish(pl->config, pl->act_link_an_mode, state->interface); if (err < 0) phylink_err(pl, "mac_finish failed: %pe\n", ERR_PTR(err)); } + if (pl->phydev && pl->phy_ib_mode) { + err = phy_config_inband(pl->phydev, pl->phy_ib_mode); + if (err < 0) + phylink_err(pl, "phy_config_inband: %pe\n", + ERR_PTR(err)); + } + if (pl->sfp_bus) { rate_kbd = phylink_interface_signal_rate(state->interface); if (rate_kbd) @@ -1261,17 +1450,16 @@ static int phylink_change_inband_advert(struct phylink *pl) return 0; phylink_dbg(pl, "%s: mode=%s/%s adv=%*pb pause=%02x\n", __func__, - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(pl->link_config.interface), __ETHTOOL_LINK_MODE_MASK_NBITS, pl->link_config.advertising, pl->link_config.pause); /* Recompute the PCS neg mode */ - pl->pcs_neg_mode = phylink_pcs_neg_mode(pl->cur_link_an_mode, - pl->link_config.interface, - pl->link_config.advertising); + phylink_pcs_neg_mode(pl, pl->pcs, pl->link_config.interface, + pl->link_config.advertising); - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; @@ -1336,7 +1524,7 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; - switch (pl->cur_link_an_mode) { + switch (pl->req_link_an_mode) { case MLO_AN_PHY: link_state = 
pl->phy_state; break; @@ -1410,14 +1598,14 @@ static void phylink_link_up(struct phylink *pl, pl->cur_interface = link_state.interface; - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs && pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; phylink_pcs_link_up(pl->pcs, neg_mode, pl->cur_interface, speed, duplex); - pl->mac_ops->mac_link_up(pl->config, pl->phydev, pl->cur_link_an_mode, + pl->mac_ops->mac_link_up(pl->config, pl->phydev, pl->act_link_an_mode, pl->cur_interface, speed, duplex, !!(link_state.pause & MLO_PAUSE_TX), rx_pause); @@ -1437,7 +1625,7 @@ static void phylink_link_down(struct phylink *pl) if (ndev) netif_carrier_off(ndev); - pl->mac_ops->mac_link_down(pl->config, pl->cur_link_an_mode, + pl->mac_ops->mac_link_down(pl->config, pl->act_link_an_mode, pl->cur_interface); phylink_info(pl, "Link is Down\n"); } @@ -1463,10 +1651,10 @@ static void phylink_resolve(struct work_struct *w) } else if (pl->link_failed) { link_state.link = false; retrigger = true; - } else if (pl->cur_link_an_mode == MLO_AN_FIXED) { + } else if (pl->act_link_an_mode == MLO_AN_FIXED) { phylink_get_fixed_state(pl, &link_state); mac_config = link_state.link; - } else if (pl->cur_link_an_mode == MLO_AN_PHY) { + } else if (pl->act_link_an_mode == MLO_AN_PHY) { link_state = pl->phy_state; mac_config = link_state.link; } else { @@ -1520,7 +1708,7 @@ static void phylink_resolve(struct work_struct *w) } } - if (pl->cur_link_an_mode != MLO_AN_FIXED) + if (pl->act_link_an_mode != MLO_AN_FIXED) phylink_apply_manual_flow(pl, &link_state); if (mac_config) { @@ -1644,7 +1832,7 @@ int phylink_set_fixed_link(struct phylink *pl, pl->link_config.an_complete = 1; pl->cfg_link_an_mode = MLO_AN_FIXED; - pl->cur_link_an_mode = pl->cfg_link_an_mode; + pl->req_link_an_mode = pl->cfg_link_an_mode; return 0; } @@ -1732,7 +1920,7 @@ struct phylink *phylink_create(struct phylink_config *config, } } - pl->cur_link_an_mode = pl->cfg_link_an_mode; + pl->req_link_an_mode = pl->cfg_link_an_mode; ret = phylink_register_sfp(pl, fwnode); if (ret < 0) { @@ -2189,7 +2377,7 @@ void phylink_start(struct phylink *pl) ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(pl->link_config.interface)); /* Always set the carrier off */ @@ -2474,7 +2662,7 @@ int phylink_ethtool_ksettings_get(struct phylink *pl, linkmode_copy(kset->link_modes.supported, pl->supported); - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: /* We are using fixed settings. Report these as the * current link settings - and note that these also @@ -2505,6 +2693,26 @@ int phylink_ethtool_ksettings_get(struct phylink *pl, } EXPORT_SYMBOL_GPL(phylink_ethtool_ksettings_get); +static bool phylink_validate_pcs_inband_autoneg(struct phylink *pl, + phy_interface_t interface, + unsigned long *adv) +{ + unsigned int inband = phylink_inband_caps(pl, interface); + unsigned int mask; + + /* If the PCS doesn't implement inband support, be permissive. 
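+	 * This keeps PCS drivers that do not (yet) implement
+	 * pcs_inband_caps() working as before.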
*/ + if (!inband) + return true; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, adv)) + mask = LINK_INBAND_ENABLE; + else + mask = LINK_INBAND_DISABLE; + + /* Check whether the PCS implements the required mode */ + return !!(inband & mask); +} + /** * phylink_ethtool_ksettings_set() - set the link settings * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -2566,7 +2774,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, /* If we have a fixed link, refuse to change link parameters. * If the link parameters match, accept them but do nothing. */ - if (pl->cur_link_an_mode == MLO_AN_FIXED) { + if (pl->req_link_an_mode == MLO_AN_FIXED) { if (s->speed != pl->link_config.speed || s->duplex != pl->link_config.duplex) return -EINVAL; @@ -2582,7 +2790,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, * is our default case) but do not allow the advertisement to * be changed. If the advertisement matches, simply return. */ - if (pl->cur_link_an_mode == MLO_AN_FIXED) { + if (pl->req_link_an_mode == MLO_AN_FIXED) { if (!linkmode_equal(config.advertising, pl->link_config.advertising)) return -EINVAL; @@ -2617,7 +2825,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, linkmode_copy(support, pl->supported); if (phylink_validate(pl, support, &config)) { phylink_err(pl, "validation of %s/%s with support %*pb failed\n", - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(config.interface), __ETHTOOL_LINK_MODE_MASK_NBITS, support); return -EINVAL; @@ -2635,6 +2843,13 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, phylink_is_empty_linkmode(config.advertising)) return -EINVAL; + /* Validate the autonegotiation state. We don't have a PHY in this + * situation, so the PCS is the media-facing entity. 
+ */ + if (!phylink_validate_pcs_inband_autoneg(pl, config.interface, + config.advertising)) + return -EINVAL; + mutex_lock(&pl->state_mutex); pl->link_config.speed = config.speed; pl->link_config.duplex = config.duplex; @@ -2717,7 +2932,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, ASSERT_RTNL(); - if (pl->cur_link_an_mode == MLO_AN_FIXED) + if (pl->req_link_an_mode == MLO_AN_FIXED) return -EOPNOTSUPP; if (!phylink_test(pl->supported, Pause) && @@ -2981,7 +3196,7 @@ static int phylink_mii_read(struct phylink *pl, unsigned int phy_id, struct phylink_link_state state; int val = 0xffff; - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: if (phy_id == 0) { phylink_get_fixed_state(pl, &state); @@ -3006,7 +3221,7 @@ static int phylink_mii_read(struct phylink *pl, unsigned int phy_id, static int phylink_mii_write(struct phylink *pl, unsigned int phy_id, unsigned int reg, unsigned int val) { - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: break; @@ -3176,10 +3391,11 @@ static phy_interface_t phylink_choose_sfp_interface(struct phylink *pl, return interface; } -static void phylink_sfp_set_config(struct phylink *pl, u8 mode, +static void phylink_sfp_set_config(struct phylink *pl, unsigned long *supported, struct phylink_link_state *state) { + u8 mode = MLO_AN_INBAND; bool changed = false; phylink_dbg(pl, "requesting link mode %s/%s with support %*pb\n", @@ -3196,9 +3412,9 @@ static void phylink_sfp_set_config(struct phylink *pl, u8 mode, changed = true; } - if (pl->cur_link_an_mode != mode || + if (pl->req_link_an_mode != mode || pl->link_config.interface != state->interface) { - pl->cur_link_an_mode = mode; + pl->req_link_an_mode = mode; pl->link_config.interface = state->interface; changed = true; @@ -3213,8 +3429,7 @@ static void phylink_sfp_set_config(struct phylink *pl, u8 mode, phylink_mac_initial_config(pl, false); } -static int phylink_sfp_config_phy(struct phylink *pl, u8 mode, - struct phy_device *phy) +static int phylink_sfp_config_phy(struct phylink *pl, struct phy_device *phy) { __ETHTOOL_DECLARE_LINK_MODE_MASK(support); struct phylink_link_state config; @@ -3258,7 +3473,7 @@ static int phylink_sfp_config_phy(struct phylink *pl, u8 mode, pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, mode, support, &config); + phylink_sfp_set_config(pl, support, &config); return 0; } @@ -3314,6 +3529,12 @@ static int phylink_sfp_config_optical(struct phylink *pl) phylink_dbg(pl, "optical SFP: chosen %s interface\n", phy_modes(interface)); + if (!phylink_validate_pcs_inband_autoneg(pl, interface, + config.advertising)) { + phylink_err(pl, "autoneg setting not compatible with PCS"); + return -EINVAL; + } + config.interface = interface; /* Ignore errors if we're expecting a PHY to attach later */ @@ -3327,7 +3548,7 @@ static int phylink_sfp_config_optical(struct phylink *pl) pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, MLO_AN_INBAND, pl->sfp_support, &config); + phylink_sfp_set_config(pl, pl->sfp_support, &config); return 0; } @@ -3398,19 +3619,16 @@ static void phylink_sfp_link_up(void *upstream) phylink_enable_and_run_resolve(pl, PHYLINK_DISABLE_LINK); } -/* The Broadcom BCM84881 in the Methode DM7052 is unable to provide a SGMII - * or 802.3z control word, so inband will not work. 
- */ -static bool phylink_phy_no_inband(struct phy_device *phy) -{ - return phy->is_c45 && phy_id_compare(phy->c45_ids.device_ids[1], - 0xae025150, 0xfffffff0); -} - static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) { struct phylink *pl = upstream; - u8 mode; + + if (!phy->drv) { + phylink_err(pl, "PHY %s (id 0x%.8lx) has no driver loaded\n", + phydev_name(phy), (unsigned long)phy->phy_id); + phylink_err(pl, "Drivers which handle known common cases: CONFIG_BCM84881_PHY, CONFIG_MARVELL_PHY\n"); + return -EINVAL; + } /* * This is the new way of dealing with flow control for PHYs, @@ -3421,17 +3639,12 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) */ phy_support_asym_pause(phy); - if (phylink_phy_no_inband(phy)) - mode = MLO_AN_PHY; - else - mode = MLO_AN_INBAND; - /* Set the PHY's host supported interfaces */ phy_interface_and(phy->host_interfaces, phylink_sfp_interfaces, pl->config->supported_interfaces); /* Do the initial configuration */ - return phylink_sfp_config_phy(pl, mode, phy); + return phylink_sfp_config_phy(pl, phy); } static void phylink_sfp_disconnect_phy(void *upstream, diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c index 7196e927c2cd..076a370be849 100644 --- a/drivers/net/phy/spi_ks8995.c +++ b/drivers/net/phy/spi_ks8995.c @@ -289,7 +289,7 @@ static int ks8995_reset(struct ks8995_switch *ks) } static ssize_t ks8995_registers_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; struct ks8995_switch *ks8995; @@ -301,7 +301,7 @@ static ssize_t ks8995_registers_read(struct file *filp, struct kobject *kobj, } static ssize_t ks8995_registers_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; struct ks8995_switch *ks8995; @@ -401,8 +401,8 @@ static const struct bin_attribute ks8995_registers_attr = { .mode = 0600, }, .size = KS8995_REGS_SIZE, - .read = ks8995_registers_read, - .write = ks8995_registers_write, + .read_new = ks8995_registers_read, + .write_new = ks8995_registers_write, }; /* ------------------------------------------------------------------------ */ diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 5aa41d5f7765..5ca6ecf0ce5f 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1329,9 +1329,9 @@ int tap_queue_resize(struct tap_dev *tap) list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; - ret = ptr_ring_resize_multiple(rings, n, - dev->tx_queue_len, GFP_KERNEL, - __skb_array_destroy_skb); + ret = ptr_ring_resize_multiple_bh(rings, n, + dev->tx_queue_len, GFP_KERNEL, + __skb_array_destroy_skb); kfree(rings); return ret; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e816aaba8e5f..28624cca91f8 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -574,14 +574,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, return ret; } -static inline bool tun_not_capable(struct tun_struct *tun) +static inline bool tun_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); - return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || - (gid_valid(tun->group) && !in_egroup_p(tun->group))) && - !ns_capable(net->user_ns, CAP_NET_ADMIN); + if 
(ns_capable(net->user_ns, CAP_NET_ADMIN)) + return 1; + if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner)) + return 1; + if (gid_valid(tun->group) && in_egroup_p(tun->group)) + return 1; + return 0; } static void tun_set_real_num_queues(struct tun_struct *tun) @@ -2778,7 +2782,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; - if (tun_not_capable(tun)) + if (!tun_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) @@ -3697,9 +3701,9 @@ static int tun_queue_resize(struct tun_struct *tun) list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; - ret = ptr_ring_resize_multiple(rings, n, - dev->tx_queue_len, GFP_KERNEL, - tun_ptr_free); + ret = ptr_ring_resize_multiple_bh(rings, n, + dev->tx_queue_len, GFP_KERNEL, + tun_ptr_free); kfree(rings); return ret; diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 531b1b6a37d1..a91bf9c7e31d 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -439,7 +439,7 @@ struct lan78xx_net { struct usb_anchor deferred; struct mutex dev_mutex; /* serialise open/stop wrt suspend/resume */ - struct mutex phy_mutex; /* for phy access */ + struct mutex mdiobus_mutex; /* for MDIO bus access */ unsigned int pipe_in, pipe_out, pipe_intr; unsigned int bulk_in_delay; @@ -472,10 +472,6 @@ struct lan78xx_net { struct irq_domain_data domain_data; }; -/* define external phy id */ -#define PHY_LAN8835 (0x0007C130) -#define PHY_KSZ9031RNX (0x00221620) - /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param(msg_level, int, 0); @@ -625,8 +621,8 @@ static int lan78xx_read_reg(struct lan78xx_net *dev, u32 index, u32 *data) *data = *buf; } else if (net_ratelimit()) { netdev_warn(dev->net, - "Failed to read register index 0x%08x. ret = %d", - index, ret); + "Failed to read register index 0x%08x. ret = %pe", + index, ERR_PTR(ret)); } kfree(buf); @@ -656,8 +652,8 @@ static int lan78xx_write_reg(struct lan78xx_net *dev, u32 index, u32 data) if (unlikely(ret < 0) && net_ratelimit()) { netdev_warn(dev->net, - "Failed to write register index 0x%08x. ret = %d", - index, ret); + "Failed to write register index 0x%08x. 
ret = %pe", + index, ERR_PTR(ret)); } kfree(buf); @@ -678,11 +674,7 @@ static int lan78xx_update_reg(struct lan78xx_net *dev, u32 reg, u32 mask, buf &= ~mask; buf |= (mask & data); - ret = lan78xx_write_reg(dev, reg, buf); - if (ret < 0) - return ret; - - return 0; + return lan78xx_write_reg(dev, reg, buf); } static int lan78xx_read_stats(struct lan78xx_net *dev, @@ -812,8 +804,156 @@ static void lan78xx_update_stats(struct lan78xx_net *dev) usb_autopm_put_interface(dev->intf); } -/* Loop until the read is completed with timeout called with phy_mutex held */ -static int lan78xx_phy_wait_not_busy(struct lan78xx_net *dev) +static int lan78xx_start_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enable) +{ + return lan78xx_update_reg(dev, reg, hw_enable, hw_enable); +} + +static int lan78xx_stop_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enabled, + u32 hw_disabled) +{ + unsigned long timeout; + bool stopped = true; + int ret; + u32 buf; + + /* Stop the h/w block (if not already stopped) */ + + ret = lan78xx_read_reg(dev, reg, &buf); + if (ret < 0) + return ret; + + if (buf & hw_enabled) { + buf &= ~hw_enabled; + + ret = lan78xx_write_reg(dev, reg, buf); + if (ret < 0) + return ret; + + stopped = false; + timeout = jiffies + HW_DISABLE_TIMEOUT; + do { + ret = lan78xx_read_reg(dev, reg, &buf); + if (ret < 0) + return ret; + + if (buf & hw_disabled) + stopped = true; + else + msleep(HW_DISABLE_DELAY_MS); + } while (!stopped && !time_after(jiffies, timeout)); + } + + return stopped ? 0 : -ETIMEDOUT; +} + +static int lan78xx_flush_fifo(struct lan78xx_net *dev, u32 reg, u32 fifo_flush) +{ + return lan78xx_update_reg(dev, reg, fifo_flush, fifo_flush); +} + +static int lan78xx_start_tx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "start tx path"); + + /* Start the MAC transmitter */ + + ret = lan78xx_start_hw(dev, MAC_TX, MAC_TX_TXEN_); + if (ret < 0) + return ret; + + /* Start the Tx FIFO */ + + ret = lan78xx_start_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_); + if (ret < 0) + return ret; + + return 0; +} + +static int lan78xx_stop_tx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "stop tx path"); + + /* Stop the Tx FIFO */ + + ret = lan78xx_stop_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_, FCT_TX_CTL_DIS_); + if (ret < 0) + return ret; + + /* Stop the MAC transmitter */ + + ret = lan78xx_stop_hw(dev, MAC_TX, MAC_TX_TXEN_, MAC_TX_TXD_); + if (ret < 0) + return ret; + + return 0; +} + +/* The caller must ensure the Tx path is stopped before calling + * lan78xx_flush_tx_fifo(). 
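+ * Stopping is normally done via lan78xx_stop_tx_path(); resetting a
+ * FIFO that is still in use may discard frames in flight.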
+ */ +static int lan78xx_flush_tx_fifo(struct lan78xx_net *dev) +{ + return lan78xx_flush_fifo(dev, FCT_TX_CTL, FCT_TX_CTL_RST_); +} + +static int lan78xx_start_rx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "start rx path"); + + /* Start the Rx FIFO */ + + ret = lan78xx_start_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_); + if (ret < 0) + return ret; + + /* Start the MAC receiver*/ + + ret = lan78xx_start_hw(dev, MAC_RX, MAC_RX_RXEN_); + if (ret < 0) + return ret; + + return 0; +} + +static int lan78xx_stop_rx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "stop rx path"); + + /* Stop the MAC receiver */ + + ret = lan78xx_stop_hw(dev, MAC_RX, MAC_RX_RXEN_, MAC_RX_RXD_); + if (ret < 0) + return ret; + + /* Stop the Rx FIFO */ + + ret = lan78xx_stop_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_, FCT_RX_CTL_DIS_); + if (ret < 0) + return ret; + + return 0; +} + +/* The caller must ensure the Rx path is stopped before calling + * lan78xx_flush_rx_fifo(). + */ +static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) +{ + return lan78xx_flush_fifo(dev, FCT_RX_CTL, FCT_RX_CTL_RST_); +} + +/* Loop until the read is completed with timeout called with mdiobus_mutex held */ +static int lan78xx_mdiobus_wait_not_busy(struct lan78xx_net *dev) { unsigned long start_time = jiffies; u32 val; @@ -821,14 +961,14 @@ static int lan78xx_phy_wait_not_busy(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, MII_ACC, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & MII_ACC_MII_BUSY_)) return 0; } while (!time_after(jiffies, start_time + HZ)); - return -EIO; + return -ETIMEDOUT; } static inline u32 mii_access(int id, int index, int read) @@ -854,8 +994,8 @@ static int lan78xx_wait_eeprom(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, E2P_CMD, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & E2P_CMD_EPC_BUSY_) || (val & E2P_CMD_EPC_TIMEOUT_)) @@ -865,7 +1005,7 @@ static int lan78xx_wait_eeprom(struct lan78xx_net *dev) if (val & (E2P_CMD_EPC_TIMEOUT_ | E2P_CMD_EPC_BUSY_)) { netdev_warn(dev->net, "EEPROM read operation timeout"); - return -EIO; + return -ETIMEDOUT; } return 0; @@ -879,8 +1019,8 @@ static int lan78xx_eeprom_confirm_not_busy(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, E2P_CMD, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & E2P_CMD_EPC_BUSY_)) return 0; @@ -889,75 +1029,81 @@ static int lan78xx_eeprom_confirm_not_busy(struct lan78xx_net *dev) } while (!time_after(jiffies, start_time + HZ)); netdev_warn(dev->net, "EEPROM is busy"); - return -EIO; + return -ETIMEDOUT; } static int lan78xx_read_raw_eeprom(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - u32 val; - u32 saved; + u32 val, saved; int i, ret; - int retval; /* depends on chip, some EEPROM pins are muxed with LED function. * disable & restore LED function to access EEPROM. 
*/ ret = lan78xx_read_reg(dev, HW_CFG, &val); + if (ret < 0) + return ret; + saved = val; if (dev->chipid == ID_REV_CHIP_ID_7800_) { val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_); ret = lan78xx_write_reg(dev, HW_CFG, val); + if (ret < 0) + return ret; } - retval = lan78xx_eeprom_confirm_not_busy(dev); - if (retval) - return retval; + ret = lan78xx_eeprom_confirm_not_busy(dev); + if (ret == -ETIMEDOUT) + goto read_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; for (i = 0; i < length; i++) { val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_READ_; val |= (offset & E2P_CMD_EPC_ADDR_MASK_); ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto read_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; ret = lan78xx_read_reg(dev, E2P_DATA, &val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; data[i] = val & 0xFF; offset++; } - retval = 0; -exit: +read_raw_eeprom_done: if (dev->chipid == ID_REV_CHIP_ID_7800_) - ret = lan78xx_write_reg(dev, HW_CFG, saved); + return lan78xx_write_reg(dev, HW_CFG, saved); - return retval; + return 0; } static int lan78xx_read_eeprom(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - u8 sig; int ret; + u8 sig; ret = lan78xx_read_raw_eeprom(dev, 0, 1, &sig); - if ((ret == 0) && (sig == EEPROM_INDICATOR)) - ret = lan78xx_read_raw_eeprom(dev, offset, length, data); - else - ret = -EINVAL; + if (ret < 0) + return ret; - return ret; + if (sig != EEPROM_INDICATOR) + return -ENODATA; + + return lan78xx_read_raw_eeprom(dev, offset, length, data); } static int lan78xx_write_raw_eeprom(struct lan78xx_net *dev, u32 offset, @@ -966,113 +1112,144 @@ static int lan78xx_write_raw_eeprom(struct lan78xx_net *dev, u32 offset, u32 val; u32 saved; int i, ret; - int retval; /* depends on chip, some EEPROM pins are muxed with LED function. * disable & restore LED function to access EEPROM. 
*/ ret = lan78xx_read_reg(dev, HW_CFG, &val); + if (ret < 0) + return ret; + saved = val; if (dev->chipid == ID_REV_CHIP_ID_7800_) { val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_); ret = lan78xx_write_reg(dev, HW_CFG, val); + if (ret < 0) + return ret; } - retval = lan78xx_eeprom_confirm_not_busy(dev); - if (retval) - goto exit; + ret = lan78xx_eeprom_confirm_not_busy(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; /* Issue write/erase enable command */ val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_EWEN_; ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; for (i = 0; i < length; i++) { /* Fill data register */ val = data[i]; ret = lan78xx_write_reg(dev, E2P_DATA, val); - if (ret < 0) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; /* Send "write" command */ val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_WRITE_; val |= (offset & E2P_CMD_EPC_ADDR_MASK_); ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (ret < 0) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; offset++; } - retval = 0; -exit: +write_raw_eeprom_done: if (dev->chipid == ID_REV_CHIP_ID_7800_) - ret = lan78xx_write_reg(dev, HW_CFG, saved); + return lan78xx_write_reg(dev, HW_CFG, saved); - return retval; + return 0; } static int lan78xx_read_raw_otp(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - int i; - u32 buf; unsigned long timeout; + int ret, i; + u32 buf; - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; if (buf & OTP_PWR_DN_PWRDN_N_) { /* clear it and wait to be cleared */ - lan78xx_write_reg(dev, OTP_PWR_DN, 0); + ret = lan78xx_write_reg(dev, OTP_PWR_DN, 0); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { usleep_range(1, 10); - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_PWR_DN"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_PWR_DN_PWRDN_N_); } for (i = 0; i < length; i++) { - lan78xx_write_reg(dev, OTP_ADDR1, - ((offset + i) >> 8) & OTP_ADDR1_15_11); - lan78xx_write_reg(dev, OTP_ADDR2, - ((offset + i) & OTP_ADDR2_10_3)); + ret = lan78xx_write_reg(dev, OTP_ADDR1, + ((offset + i) >> 8) & OTP_ADDR1_15_11); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_ADDR2, + ((offset + i) & OTP_ADDR2_10_3)); + if (ret < 0) + return ret; - lan78xx_write_reg(dev, OTP_FUNC_CMD, OTP_FUNC_CMD_READ_); - lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + ret = lan78xx_write_reg(dev, OTP_FUNC_CMD, OTP_FUNC_CMD_READ_); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + if (ret < 0) + return ret; timeout = jiffies + HZ; 
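+		/* poll OTP_STATUS until the read completes or times out */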
do { udelay(1); - lan78xx_read_reg(dev, OTP_STATUS, &buf); + ret = lan78xx_read_reg(dev, OTP_STATUS, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_STATUS"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_STATUS_BUSY_); - lan78xx_read_reg(dev, OTP_RD_DATA, &buf); + ret = lan78xx_read_reg(dev, OTP_RD_DATA, &buf); + if (ret < 0) + return ret; data[i] = (u8)(buf & 0xFF); } @@ -1086,45 +1263,72 @@ static int lan78xx_write_raw_otp(struct lan78xx_net *dev, u32 offset, int i; u32 buf; unsigned long timeout; + int ret; - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; if (buf & OTP_PWR_DN_PWRDN_N_) { /* clear it and wait to be cleared */ - lan78xx_write_reg(dev, OTP_PWR_DN, 0); + ret = lan78xx_write_reg(dev, OTP_PWR_DN, 0); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { udelay(1); - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_PWR_DN completion"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_PWR_DN_PWRDN_N_); } /* set to BYTE program mode */ - lan78xx_write_reg(dev, OTP_PRGM_MODE, OTP_PRGM_MODE_BYTE_); + ret = lan78xx_write_reg(dev, OTP_PRGM_MODE, OTP_PRGM_MODE_BYTE_); + if (ret < 0) + return ret; for (i = 0; i < length; i++) { - lan78xx_write_reg(dev, OTP_ADDR1, - ((offset + i) >> 8) & OTP_ADDR1_15_11); - lan78xx_write_reg(dev, OTP_ADDR2, - ((offset + i) & OTP_ADDR2_10_3)); - lan78xx_write_reg(dev, OTP_PRGM_DATA, data[i]); - lan78xx_write_reg(dev, OTP_TST_CMD, OTP_TST_CMD_PRGVRFY_); - lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + ret = lan78xx_write_reg(dev, OTP_ADDR1, + ((offset + i) >> 8) & OTP_ADDR1_15_11); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_ADDR2, + ((offset + i) & OTP_ADDR2_10_3)); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_PRGM_DATA, data[i]); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_TST_CMD, OTP_TST_CMD_PRGVRFY_); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { udelay(1); - lan78xx_read_reg(dev, OTP_STATUS, &buf); + ret = lan78xx_read_reg(dev, OTP_STATUS, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "Timeout on OTP_STATUS completion"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_STATUS_BUSY_); } @@ -1161,7 +1365,7 @@ static int lan78xx_dataport_wait_not_busy(struct lan78xx_net *dev) ret = lan78xx_read_reg(dev, DP_SEL, &dp_sel); if (unlikely(ret < 0)) - return -EIO; + return ret; if (dp_sel & DP_SEL_DPRDY_) return 0; @@ -1171,44 +1375,51 @@ static int lan78xx_dataport_wait_not_busy(struct lan78xx_net *dev) netdev_warn(dev->net, "%s timed out", __func__); - return -EIO; + return -ETIMEDOUT; } static int lan78xx_dataport_write(struct lan78xx_net *dev, u32 ram_select, u32 addr, u32 length, u32 *buf) { struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]); - u32 dp_sel; int i, ret; - if (usb_autopm_get_interface(dev->intf) < 0) - return 0; + ret = usb_autopm_get_interface(dev->intf); + if (ret < 0) + return ret; mutex_lock(&pdata->dataport_mutex); ret = lan78xx_dataport_wait_not_busy(dev); if (ret < 0) - goto done; + goto dataport_write; - ret = lan78xx_read_reg(dev, DP_SEL, &dp_sel); - - dp_sel &= ~DP_SEL_RSEL_MASK_; - 
dp_sel |= ram_select; - ret = lan78xx_write_reg(dev, DP_SEL, dp_sel); + ret = lan78xx_update_reg(dev, DP_SEL, DP_SEL_RSEL_MASK_, ram_select); + if (ret < 0) + goto dataport_write; for (i = 0; i < length; i++) { ret = lan78xx_write_reg(dev, DP_ADDR, addr + i); + if (ret < 0) + goto dataport_write; ret = lan78xx_write_reg(dev, DP_DATA, buf[i]); + if (ret < 0) + goto dataport_write; ret = lan78xx_write_reg(dev, DP_CMD, DP_CMD_WRITE_); + if (ret < 0) + goto dataport_write; ret = lan78xx_dataport_wait_not_busy(dev); if (ret < 0) - goto done; + goto dataport_write; } -done: +dataport_write: + if (ret < 0) + netdev_warn(dev->net, "dataport write failed %pe", ERR_PTR(ret)); + mutex_unlock(&pdata->dataport_mutex); usb_autopm_put_interface(dev->intf); @@ -1244,23 +1455,39 @@ static void lan78xx_deferred_multicast_write(struct work_struct *param) struct lan78xx_priv *pdata = container_of(param, struct lan78xx_priv, set_multicast); struct lan78xx_net *dev = pdata->dev; - int i; + int i, ret; netif_dbg(dev, drv, dev->net, "deferred multicast write 0x%08x\n", pdata->rfe_ctl); - lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, DP_SEL_VHF_VLAN_LEN, - DP_SEL_VHF_HASH_LEN, pdata->mchash_table); + ret = lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, + DP_SEL_VHF_VLAN_LEN, + DP_SEL_VHF_HASH_LEN, pdata->mchash_table); + if (ret < 0) + goto multicast_write_done; for (i = 1; i < NUM_OF_MAF; i++) { - lan78xx_write_reg(dev, MAF_HI(i), 0); - lan78xx_write_reg(dev, MAF_LO(i), - pdata->pfilter_table[i][1]); - lan78xx_write_reg(dev, MAF_HI(i), - pdata->pfilter_table[i][0]); + ret = lan78xx_write_reg(dev, MAF_HI(i), 0); + if (ret < 0) + goto multicast_write_done; + + ret = lan78xx_write_reg(dev, MAF_LO(i), + pdata->pfilter_table[i][1]); + if (ret < 0) + goto multicast_write_done; + + ret = lan78xx_write_reg(dev, MAF_HI(i), + pdata->pfilter_table[i][0]); + if (ret < 0) + goto multicast_write_done; } - lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); + ret = lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); + +multicast_write_done: + if (ret < 0) + netdev_warn(dev->net, "multicast write failed %pe", ERR_PTR(ret)); + return; } static void lan78xx_set_multicast(struct net_device *netdev) @@ -1369,24 +1596,24 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) u32 val; int ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* Resetting the device while there is activity on the MDIO * bus can result in the MAC interface locking up and not * completing register access transactions. */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) - goto done; + goto exit_unlock; ret = lan78xx_read_reg(dev, MAC_CR, &val); if (ret < 0) - goto done; + goto exit_unlock; val |= MAC_CR_RST_; ret = lan78xx_write_reg(dev, MAC_CR, val); if (ret < 0) - goto done; + goto exit_unlock; /* Wait for the reset to complete before allowing any further * MAC register accesses otherwise the MAC may lock up. 
@@ -1394,17 +1621,17 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, MAC_CR, &val); if (ret < 0) - goto done; + goto exit_unlock; if (!(val & MAC_CR_RST_)) { ret = 0; - goto done; + goto exit_unlock; } } while (!time_after(jiffies, start_time + HZ)); ret = -ETIMEDOUT; -done: - mutex_unlock(&dev->phy_mutex); +exit_unlock: + mutex_unlock(&dev->mdiobus_mutex); return ret; } @@ -1630,6 +1857,7 @@ static void lan78xx_get_wol(struct net_device *netdev, ret = lan78xx_read_reg(dev, USB_CFG0, &buf); if (unlikely(ret < 0)) { + netdev_warn(dev->net, "failed to get WoL %pe", ERR_PTR(ret)); wol->supported = 0; wol->wolopts = 0; } else { @@ -1661,10 +1889,13 @@ static int lan78xx_set_wol(struct net_device *netdev, pdata->wol = wol->wolopts; - device_set_wakeup_enable(&dev->udev->dev, (bool)wol->wolopts); + ret = device_set_wakeup_enable(&dev->udev->dev, (bool)wol->wolopts); + if (ret < 0) + goto exit_pm_put; - phy_ethtool_set_wol(netdev->phydev, wol); + ret = phy_ethtool_set_wol(netdev->phydev, wol); +exit_pm_put: usb_autopm_put_interface(dev->intf); return ret; @@ -1869,30 +2100,35 @@ exit: static int lan78xx_get_regs_len(struct net_device *netdev) { - if (!netdev->phydev) - return (sizeof(lan78xx_regs)); - else - return (sizeof(lan78xx_regs) + PHY_REG_SIZE); + return sizeof(lan78xx_regs); } static void lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *buf) { - u32 *data = buf; - int i, j; struct lan78xx_net *dev = netdev_priv(netdev); + unsigned int data_count = 0; + u32 *data = buf; + int i, ret; /* Read Device/MAC registers */ - for (i = 0; i < ARRAY_SIZE(lan78xx_regs); i++) - lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]); + for (i = 0; i < ARRAY_SIZE(lan78xx_regs); i++) { + ret = lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]); + if (ret < 0) { + netdev_warn(dev->net, + "failed to read register 0x%08x\n", + lan78xx_regs[i]); + goto clean_data; + } - if (!netdev->phydev) - return; + data_count++; + } - /* Read PHY registers */ - for (j = 0; j < 32; i++, j++) - data[i] = phy_read(netdev->phydev, j); + return; + +clean_data: + memset(data, 0, data_count * sizeof(u32)); } static const struct ethtool_ops lan78xx_ethtool_ops = { @@ -1920,13 +2156,19 @@ static const struct ethtool_ops lan78xx_ethtool_ops = { .get_regs = lan78xx_get_regs, }; -static void lan78xx_init_mac_address(struct lan78xx_net *dev) +static int lan78xx_init_mac_address(struct lan78xx_net *dev) { u32 addr_lo, addr_hi; u8 addr[6]; + int ret; - lan78xx_read_reg(dev, RX_ADDRL, &addr_lo); - lan78xx_read_reg(dev, RX_ADDRH, &addr_hi); + ret = lan78xx_read_reg(dev, RX_ADDRL, &addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_read_reg(dev, RX_ADDRH, &addr_hi); + if (ret < 0) + return ret; addr[0] = addr_lo & 0xFF; addr[1] = (addr_lo >> 8) & 0xFF; @@ -1959,14 +2201,26 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev) (addr[2] << 16) | (addr[3] << 24); addr_hi = addr[4] | (addr[5] << 8); - lan78xx_write_reg(dev, RX_ADDRL, addr_lo); - lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + if (ret < 0) + return ret; } - lan78xx_write_reg(dev, MAF_LO(0), addr_lo); - lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + if (ret < 0) + return ret; eth_hw_addr_set(dev->net, addr); + 
+ return 0; } /* MDIO read and write wrappers for phylib */ @@ -1980,27 +2234,31 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) if (ret < 0) return ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* confirm MII not busy */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; /* set the address, index & direction (read from PHY) */ addr = mii_access(phy_id, idx, MII_READ); ret = lan78xx_write_reg(dev, MII_ACC, addr); + if (ret < 0) + goto done; - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; ret = lan78xx_read_reg(dev, MII_DATA, &val); + if (ret < 0) + goto done; ret = (int)(val & 0xFFFF); done: - mutex_unlock(&dev->phy_mutex); + mutex_unlock(&dev->mdiobus_mutex); usb_autopm_put_interface(dev->intf); return ret; @@ -2017,28 +2275,32 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, if (ret < 0) return ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* confirm MII not busy */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; val = (u32)regval; ret = lan78xx_write_reg(dev, MII_DATA, val); + if (ret < 0) + goto done; /* set the address, index & direction (write to PHY) */ addr = mii_access(phy_id, idx, MII_WRITE); ret = lan78xx_write_reg(dev, MII_ACC, addr); + if (ret < 0) + goto done; - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; done: - mutex_unlock(&dev->phy_mutex); + mutex_unlock(&dev->mdiobus_mutex); usb_autopm_put_interface(dev->intf); - return 0; + return ret; } static int lan78xx_mdio_init(struct lan78xx_net *dev) @@ -2164,13 +2426,22 @@ static void lan78xx_irq_bus_sync_unlock(struct irq_data *irqd) struct lan78xx_net *dev = container_of(data, struct lan78xx_net, domain_data); u32 buf; + int ret; /* call register access here because irq_bus_lock & irq_bus_sync_unlock * are only two callbacks executed in non-atomic contex. 
*/ - lan78xx_read_reg(dev, INT_EP_CTL, &buf); + ret = lan78xx_read_reg(dev, INT_EP_CTL, &buf); + if (ret < 0) + goto irq_bus_sync_unlock; + if (buf != data->irqenable) - lan78xx_write_reg(dev, INT_EP_CTL, data->irqenable); + ret = lan78xx_write_reg(dev, INT_EP_CTL, data->irqenable); + +irq_bus_sync_unlock: + if (ret < 0) + netdev_err(dev->net, "Failed to sync IRQ enable register: %pe\n", + ERR_PTR(ret)); mutex_unlock(&data->irq_lock); } @@ -2195,7 +2466,10 @@ static int lan78xx_setup_irq_domain(struct lan78xx_net *dev) mutex_init(&dev->domain_data.irq_lock); - lan78xx_read_reg(dev, INT_EP_CTL, &buf); + ret = lan78xx_read_reg(dev, INT_EP_CTL, &buf); + if (ret < 0) + return ret; + dev->domain_data.irqenable = buf; dev->domain_data.irqchip = &lan78xx_irqchip; @@ -2234,46 +2508,6 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } -static int lan8835_fixup(struct phy_device *phydev) -{ - int buf; - struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); - - /* LED2/PME_N/IRQ_N/RGMII_ID pin to IRQ_N mode */ - buf = phy_read_mmd(phydev, MDIO_MMD_PCS, 0x8010); - buf &= ~0x1800; - buf |= 0x0800; - phy_write_mmd(phydev, MDIO_MMD_PCS, 0x8010, buf); - - /* RGMII MAC TXC Delay Enable */ - lan78xx_write_reg(dev, MAC_RGMII_ID, - MAC_RGMII_ID_TXC_DELAY_EN_); - - /* RGMII TX DLL Tune Adjust */ - lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00); - - dev->interface = PHY_INTERFACE_MODE_RGMII_TXID; - - return 1; -} - -static int ksz9031rnx_fixup(struct phy_device *phydev) -{ - struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); - - /* Micrel9301RNX PHY configuration */ - /* RGMII Control Signal Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 4, 0x0077); - /* RGMII RX Data Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 5, 0x7777); - /* RGMII RX Clock Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 8, 0x1FF); - - dev->interface = PHY_INTERFACE_MODE_RGMII_RXID; - - return 1; -} - static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) { u32 buf; @@ -2307,22 +2541,11 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) netdev_err(dev->net, "no PHY driver found\n"); return NULL; } - dev->interface = PHY_INTERFACE_MODE_RGMII; - /* external PHY fixup for KSZ9031RNX */ - ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0, - ksz9031rnx_fixup); - if (ret < 0) { - netdev_err(dev->net, "Failed to register fixup for PHY_KSZ9031RNX\n"); - return NULL; - } - /* external PHY fixup for LAN8835 */ - ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0, - lan8835_fixup); - if (ret < 0) { - netdev_err(dev->net, "Failed to register fixup for PHY_LAN8835\n"); - return NULL; - } - /* add more external PHY fixup here if needed */ + dev->interface = PHY_INTERFACE_MODE_RGMII_ID; + /* The PHY driver is responsible to configure proper RGMII + * interface delays. Disable RGMII delays on MAC side. 
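+	 * Otherwise both the MAC and the PHY would insert a TXC delay,
+	 * which could break the RGMII timing.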
+ */ + lan78xx_write_reg(dev, MAC_RGMII_ID, 0); phydev->is_internal = false; } @@ -2381,11 +2604,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) if (phy_is_pseudo_fixed_link(phydev)) { fixed_phy_unregister(phydev); phy_device_free(phydev); - } else { - phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, - 0xfffffff0); - phy_unregister_fixup_for_uid(PHY_LAN8835, - 0xfffffff0); } } return -EIO; @@ -2437,27 +2655,36 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size) { - u32 buf; bool rxenabled; + u32 buf; + int ret; - lan78xx_read_reg(dev, MAC_RX, &buf); + ret = lan78xx_read_reg(dev, MAC_RX, &buf); + if (ret < 0) + return ret; rxenabled = ((buf & MAC_RX_RXEN_) != 0); if (rxenabled) { buf &= ~MAC_RX_RXEN_; - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; } /* add 4 to size for FCS */ buf &= ~MAC_RX_MAX_SIZE_MASK_; buf |= (((size + 4) << MAC_RX_MAX_SIZE_SHIFT_) & MAC_RX_MAX_SIZE_MASK_); - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; if (rxenabled) { buf |= MAC_RX_RXEN_; - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; } return 0; @@ -2523,7 +2750,10 @@ static int lan78xx_change_mtu(struct net_device *netdev, int new_mtu) return ret; ret = lan78xx_set_rx_max_frame_length(dev, max_frame_len); - if (!ret) + if (ret < 0) + netdev_err(dev->net, "MTU changed to %d from %d failed with %pe\n", + new_mtu, netdev->mtu, ERR_PTR(ret)); + else WRITE_ONCE(netdev->mtu, new_mtu); usb_autopm_put_interface(dev->intf); @@ -2536,6 +2766,7 @@ static int lan78xx_set_mac_addr(struct net_device *netdev, void *p) struct lan78xx_net *dev = netdev_priv(netdev); struct sockaddr *addr = p; u32 addr_lo, addr_hi; + int ret; if (netif_running(netdev)) return -EBUSY; @@ -2552,14 +2783,20 @@ static int lan78xx_set_mac_addr(struct net_device *netdev, void *p) addr_hi = netdev->dev_addr[4] | netdev->dev_addr[5] << 8; - lan78xx_write_reg(dev, RX_ADDRL, addr_lo); - lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + if (ret < 0) + return ret; /* Added to support MAC address changes */ - lan78xx_write_reg(dev, MAF_LO(0), addr_lo); - lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo); + if (ret < 0) + return ret; - return 0; + return lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); } /* Enable or disable Rx checksum offload engine */ @@ -2592,9 +2829,7 @@ static int lan78xx_set_features(struct net_device *netdev, spin_unlock_irqrestore(&pdata->rfe_ctl_lock, flags); - lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); - - return 0; + return lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); } static void lan78xx_deferred_vlan_write(struct work_struct *param) @@ -2645,13 +2880,16 @@ static int lan78xx_vlan_rx_kill_vid(struct net_device *netdev, return 0; } -static void lan78xx_init_ltm(struct lan78xx_net *dev) +static int lan78xx_init_ltm(struct lan78xx_net *dev) { + u32 regs[6] = { 0 }; int ret; u32 buf; - u32 regs[6] = { 0 }; ret = lan78xx_read_reg(dev, USB_CFG1, &buf); + if (ret < 0) + goto init_ltm_failed; + if (buf & USB_CFG1_LTM_ENABLE_) { u8 temp[2]; /* Get values from EEPROM first */ @@ -2662,7 +2900,7 @@ static void lan78xx_init_ltm(struct lan78xx_net *dev) 24, 
(u8 *)regs); if (ret < 0) - return; + return ret; } } else if (lan78xx_read_otp(dev, 0x3F, 2, temp) == 0) { if (temp[0] == 24) { @@ -2671,17 +2909,40 @@ static void lan78xx_init_ltm(struct lan78xx_net *dev) 24, (u8 *)regs); if (ret < 0) - return; + return ret; } } } - lan78xx_write_reg(dev, LTM_BELT_IDLE0, regs[0]); - lan78xx_write_reg(dev, LTM_BELT_IDLE1, regs[1]); - lan78xx_write_reg(dev, LTM_BELT_ACT0, regs[2]); - lan78xx_write_reg(dev, LTM_BELT_ACT1, regs[3]); - lan78xx_write_reg(dev, LTM_INACTIVE0, regs[4]); - lan78xx_write_reg(dev, LTM_INACTIVE1, regs[5]); + ret = lan78xx_write_reg(dev, LTM_BELT_IDLE0, regs[0]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_IDLE1, regs[1]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_ACT0, regs[2]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_ACT1, regs[3]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_INACTIVE0, regs[4]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_INACTIVE1, regs[5]); + if (ret < 0) + goto init_ltm_failed; + + return 0; + +init_ltm_failed: + netdev_err(dev->net, "Failed to init LTM with error %pe\n", ERR_PTR(ret)); + return ret; } static int lan78xx_urb_config_init(struct lan78xx_net *dev) @@ -2722,156 +2983,6 @@ static int lan78xx_urb_config_init(struct lan78xx_net *dev) return result; } -static int lan78xx_start_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enable) -{ - return lan78xx_update_reg(dev, reg, hw_enable, hw_enable); -} - -static int lan78xx_stop_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enabled, - u32 hw_disabled) -{ - unsigned long timeout; - bool stopped = true; - int ret; - u32 buf; - - /* Stop the h/w block (if not already stopped) */ - - ret = lan78xx_read_reg(dev, reg, &buf); - if (ret < 0) - return ret; - - if (buf & hw_enabled) { - buf &= ~hw_enabled; - - ret = lan78xx_write_reg(dev, reg, buf); - if (ret < 0) - return ret; - - stopped = false; - timeout = jiffies + HW_DISABLE_TIMEOUT; - do { - ret = lan78xx_read_reg(dev, reg, &buf); - if (ret < 0) - return ret; - - if (buf & hw_disabled) - stopped = true; - else - msleep(HW_DISABLE_DELAY_MS); - } while (!stopped && !time_after(jiffies, timeout)); - } - - ret = stopped ? 0 : -ETIME; - - return ret; -} - -static int lan78xx_flush_fifo(struct lan78xx_net *dev, u32 reg, u32 fifo_flush) -{ - return lan78xx_update_reg(dev, reg, fifo_flush, fifo_flush); -} - -static int lan78xx_start_tx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "start tx path"); - - /* Start the MAC transmitter */ - - ret = lan78xx_start_hw(dev, MAC_TX, MAC_TX_TXEN_); - if (ret < 0) - return ret; - - /* Start the Tx FIFO */ - - ret = lan78xx_start_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_); - if (ret < 0) - return ret; - - return 0; -} - -static int lan78xx_stop_tx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "stop tx path"); - - /* Stop the Tx FIFO */ - - ret = lan78xx_stop_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_, FCT_TX_CTL_DIS_); - if (ret < 0) - return ret; - - /* Stop the MAC transmitter */ - - ret = lan78xx_stop_hw(dev, MAC_TX, MAC_TX_TXEN_, MAC_TX_TXD_); - if (ret < 0) - return ret; - - return 0; -} - -/* The caller must ensure the Tx path is stopped before calling - * lan78xx_flush_tx_fifo(). 
- */ -static int lan78xx_flush_tx_fifo(struct lan78xx_net *dev) -{ - return lan78xx_flush_fifo(dev, FCT_TX_CTL, FCT_TX_CTL_RST_); -} - -static int lan78xx_start_rx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "start rx path"); - - /* Start the Rx FIFO */ - - ret = lan78xx_start_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_); - if (ret < 0) - return ret; - - /* Start the MAC receiver*/ - - ret = lan78xx_start_hw(dev, MAC_RX, MAC_RX_RXEN_); - if (ret < 0) - return ret; - - return 0; -} - -static int lan78xx_stop_rx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "stop rx path"); - - /* Stop the MAC receiver */ - - ret = lan78xx_stop_hw(dev, MAC_RX, MAC_RX_RXEN_, MAC_RX_RXD_); - if (ret < 0) - return ret; - - /* Stop the Rx FIFO */ - - ret = lan78xx_stop_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_, FCT_RX_CTL_DIS_); - if (ret < 0) - return ret; - - return 0; -} - -/* The caller must ensure the Rx path is stopped before calling - * lan78xx_flush_rx_fifo(). - */ -static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) -{ - return lan78xx_flush_fifo(dev, FCT_RX_CTL, FCT_RX_CTL_RST_); -} - static int lan78xx_reset(struct lan78xx_net *dev) { struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]); @@ -2905,7 +3016,9 @@ static int lan78xx_reset(struct lan78xx_net *dev) } } while (buf & HW_CFG_LRST_); - lan78xx_init_mac_address(dev); + ret = lan78xx_init_mac_address(dev); + if (ret < 0) + return ret; /* save DEVID for later usage */ ret = lan78xx_read_reg(dev, ID_REV, &buf); @@ -2927,7 +3040,9 @@ static int lan78xx_reset(struct lan78xx_net *dev) return ret; /* Init LTM */ - lan78xx_init_ltm(dev); + ret = lan78xx_init_ltm(dev); + if (ret < 0) + return ret; ret = lan78xx_write_reg(dev, BURST_CAP, dev->burst_cap); if (ret < 0) @@ -4242,9 +4357,6 @@ static void lan78xx_disconnect(struct usb_interface *intf) phydev = net->phydev; - phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); - phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0); - phy_disconnect(net->phydev); if (phy_is_pseudo_fixed_link(phydev)) { @@ -4349,7 +4461,7 @@ static int lan78xx_probe(struct usb_interface *intf, skb_queue_head_init(&dev->rxq_done); skb_queue_head_init(&dev->txq_pend); skb_queue_head_init(&dev->rxq_overflow); - mutex_init(&dev->phy_mutex); + mutex_init(&dev->mdiobus_mutex); mutex_init(&dev->dev_mutex); ret = lan78xx_urb_config_init(dev); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 07ebb800edf1..01251868a9c2 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -634,7 +634,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; @@ -646,7 +646,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 67d25f4f94ef..ca81b212a246 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -122,16 +122,6 @@ struct net_vrf { int ifindex; }; -static void vrf_rx_stats(struct net_device *dev, int len) -{ - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - u64_stats_update_begin(&dstats->syncp); - u64_stats_inc(&dstats->rx_packets); - 
u64_stats_add(&dstats->rx_bytes, len); - u64_stats_update_end(&dstats->syncp); -} - static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; @@ -369,7 +359,7 @@ static bool qdisc_tx_is_default(const struct net_device *dev) static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { - int len = skb->len; + unsigned int len = skb->len; skb_orphan(skb); @@ -382,15 +372,10 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, skb->protocol = eth_type_trans(skb, dev); - if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { - vrf_rx_stats(dev, len); - } else { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - u64_stats_update_begin(&dstats->syncp); - u64_stats_inc(&dstats->rx_drops); - u64_stats_update_end(&dstats->syncp); - } + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) + dev_dstats_rx_add(dev, len); + else + dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } @@ -578,20 +563,14 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - int len = skb->len; - netdev_tx_t ret = is_ip_tx_frame(skb, dev); - - u64_stats_update_begin(&dstats->syncp); - if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { + unsigned int len = skb->len; + netdev_tx_t ret; - u64_stats_inc(&dstats->tx_packets); - u64_stats_add(&dstats->tx_bytes, len); - } else { - u64_stats_inc(&dstats->tx_drops); - } - u64_stats_update_end(&dstats->syncp); + ret = is_ip_tx_frame(skb, dev); + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) + dev_dstats_tx_add(dev, len); + else + dev_dstats_tx_dropped(dev); return ret; } @@ -1364,7 +1343,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (!is_ndisc) { struct net_device *orig_dev = skb->dev; - vrf_rx_stats(vrf_dev, skb->len); + dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; @@ -1420,7 +1399,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, goto out; } - vrf_rx_stats(vrf_dev, skb->len); + dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 9ea63059d52d..0c356e0a61ef 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -622,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) +static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; + const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. 
*/ if (!gpe->np_applied) @@ -1352,6 +1352,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; int err = 0; @@ -1364,7 +1365,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, @@ -1381,7 +1382,7 @@ skip_nh: } list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = vxlan_fdb_info(skb, vxlan, f, @@ -1554,18 +1555,17 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) #endif } -static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, - struct sk_buff *skb, - u32 vxflags) +static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { + const struct vxlanhdr *vh = vxlan_hdr(skb); enum skb_drop_reason reason; size_t start, offset; - if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) - goto out; + if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) + return SKB_NOT_DROPPED_YET; - start = vxlan_rco_start(unparsed->vx_vni); - offset = start + vxlan_rco_offset(unparsed->vx_vni); + start = vxlan_rco_start(vh->vx_vni); + offset = start + vxlan_rco_offset(vh->vx_vni); reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); if (reason) @@ -1573,22 +1573,20 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); -out: - unparsed->vx_flags &= ~VXLAN_HF_RCO; - unparsed->vx_vni &= VXLAN_VNI_MASK; - return SKB_NOT_DROPPED_YET; } -static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags, +static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { - struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; + const struct vxlanhdr *vh = vxlan_hdr(skb); + const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; - if (!(unparsed->vx_flags & VXLAN_HF_GBP)) - goto out; + gbp = (const struct vxlanhdr_gbp *)vh; + + if (!(vh->vx_flags & VXLAN_HF_GBP)) + return; md->gbp = ntohs(gbp->policy_id); @@ -1607,8 +1605,6 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; -out: - unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, @@ -1672,9 +1668,9 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; + const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; - struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); @@ -1689,24 +1685,21 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto drop; - unparsed = *vxlan_hdr(skb); + vh = vxlan_hdr(skb); /* VNI flag always required to be set */ - if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { + if (!(vh->vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxlan_hdr(skb)->vx_flags), - 
ntohl(vxlan_hdr(skb)->vx_vni));
+			   ntohl(vh->vx_flags), ntohl(vh->vx_vni));
 		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
 		/* Return non vxlan pkt */
 		goto drop;
 	}
-	unparsed.vx_flags &= ~VXLAN_HF_VNI;
-	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
 
 	vs = rcu_dereference_sk_user_data(sk);
 	if (!vs)
 		goto drop;
 
-	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
+	vni = vxlan_vni(vh->vx_vni);
 
 	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
 	if (!vxlan) {
@@ -1714,13 +1707,27 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 	}
 
-	/* For backwards compatibility, only allow reserved fields to be
-	 * used by VXLAN extensions if explicitly requested.
-	 */
-	if (vs->flags & VXLAN_F_GPE) {
-		if (!vxlan_parse_gpe_proto(&unparsed, &protocol))
+	if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
+	    vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
+		/* If the header uses bits besides those enabled by the
+		 * netdevice configuration, treat this as a malformed packet.
+		 * This behavior diverges from the VXLAN RFC (RFC7348), which
+		 * stipulates that bits in reserved fields are to be ignored.
+		 * The approach here maintains compatibility with previous
+		 * stack code, and also is more robust and provides a little
+		 * more security in adding extensions to VXLAN.
+		 */
+		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
+		DEV_STATS_INC(vxlan->dev, rx_frame_errors);
+		DEV_STATS_INC(vxlan->dev, rx_errors);
+		vxlan_vnifilter_count(vxlan, vni, vninode,
+				      VXLAN_VNI_STATS_RX_ERRORS, 0);
+		goto drop;
+	}
+
+	if (vxlan->cfg.flags & VXLAN_F_GPE) {
+		if (!vxlan_parse_gpe_proto(vh, &protocol))
 			goto drop;
-		unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS;
 		raw_proto = true;
 	}
 
@@ -1730,8 +1737,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 	}
 
-	if (vs->flags & VXLAN_F_REMCSUM_RX) {
-		reason = vxlan_remcsum(&unparsed, skb, vs->flags);
+	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
+		reason = vxlan_remcsum(skb, vxlan->cfg.flags);
 		if (unlikely(reason))
 			goto drop;
 	}
@@ -1756,25 +1763,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 		memset(md, 0, sizeof(*md));
 	}
 
-	if (vs->flags & VXLAN_F_GBP)
-		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
+	if (vxlan->cfg.flags & VXLAN_F_GBP)
+		vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
 	/* Note that GBP and GPE can never be active together. This is
 	 * ensured in vxlan_dev_configure.
 	 */
 
-	if (unparsed.vx_flags || unparsed.vx_vni) {
-		/* If there are any unprocessed flags remaining treat
-		 * this as a malformed packet. This behavior diverges from
-		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
-		 * in reserved fields are to be ignored. The approach here
-		 * maintains compatibility with previous stack code, and also
-		 * is more robust and provides a little more security in
-		 * adding extensions to VXLAN.
- */ - reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; - goto drop; - } - if (!raw_proto) { reason = vxlan_set_mac(vxlan, vs, skb, vni); if (reason) @@ -1818,14 +1812,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); - dev_core_stats_rx_dropped_inc(vxlan->dev); + dev_dstats_rx_dropped(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); reason = SKB_DROP_REASON_DEV_READY; goto drop; } - dev_sw_netstats_rx_add(vxlan->dev, skb->len); + dev_dstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); @@ -1880,7 +1874,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); goto out; @@ -1938,7 +1932,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2097,7 +2091,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (netif_rx(reply) == NET_RX_DROP) { - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2271,8 +2265,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; + unsigned int len = skb->len; struct net_device *dev; - int len = skb->len; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; @@ -2299,16 +2293,16 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); - dev_sw_netstats_tx_add(src_vxlan->dev, 1, len); + dev_dstats_tx_add(src_vxlan->dev, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { - dev_sw_netstats_rx_add(dst_vxlan->dev, len); + dev_dstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2621,7 +2615,7 @@ out_unlock: return; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, reason); return; @@ -2666,7 +2660,7 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, return; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2704,7 +2698,7 @@ static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, return NETDEV_TX_OK; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2801,7 +2795,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, 
struct net_device *dev) !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); @@ -3371,7 +3365,7 @@ static void vxlan_setup(struct net_device *dev) dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); @@ -3435,6 +3429,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), + [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4070,6 +4065,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { + struct vxlanhdr used_bits = { + .vx_flags = VXLAN_HF_VNI, + .vx_vni = VXLAN_VNI_MASK, + }; struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; @@ -4296,6 +4295,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + used_bits.vx_flags |= VXLAN_HF_RCO; + used_bits.vx_vni |= ~VXLAN_VNI_MASK; } if (data[IFLA_VXLAN_GBP]) { @@ -4303,6 +4304,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], VXLAN_F_GBP, changelink, false, extack); if (err) return err; + used_bits.vx_flags |= VXLAN_GBP_USED_BITS; } if (data[IFLA_VXLAN_GPE]) { @@ -4311,6 +4313,46 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + + used_bits.vx_flags |= VXLAN_GPE_USED_BITS; + } + + if (data[IFLA_VXLAN_RESERVED_BITS]) { + struct vxlanhdr reserved_bits; + + if (changelink) { + NL_SET_ERR_MSG_ATTR(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Cannot change reserved_bits"); + return -EOPNOTSUPP; + } + + nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS], + sizeof(reserved_bits)); + if (used_bits.vx_flags & reserved_bits.vx_flags || + used_bits.vx_vni & reserved_bits.vx_vni) { + __be64 ub_be64, rb_be64; + + memcpy(&ub_be64, &used_bits, sizeof(ub_be64)); + memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64)); + + NL_SET_ERR_MSG_ATTR_FMT(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Used bits %#018llx cannot overlap reserved bits %#018llx", + be64_to_cpu(ub_be64), + be64_to_cpu(rb_be64)); + return -EINVAL; + } + + conf->reserved_bits = reserved_bits; + } else { + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. 
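+	 *
+	 * Worked example (editor's note, not part of the original patch):
+	 * with no extensions configured, used_bits covers only VXLAN_HF_VNI
+	 * and VXLAN_VNI_MASK, so the complement below marks every other
+	 * flags/VNI-word bit as reserved; requesting RCO, GBP or GPE above
+	 * first widens used_bits and thereby narrows this default reserved
+	 * set.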
+ */ + conf->reserved_bits = (struct vxlanhdr) { + .vx_flags = ~used_bits.vx_flags, + .vx_vni = ~used_bits.vx_vni, + }; } if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { @@ -4497,6 +4539,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ + /* IFLA_VXLAN_RESERVED_BITS */ + nla_total_size(sizeof(struct vxlanhdr)) + 0; } @@ -4599,6 +4643,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; + if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS, + sizeof(vxlan->cfg.reserved_bits), + &vxlan->cfg.reserved_bits)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/drivers/net/wan/framer/framer-core.c b/drivers/net/wan/framer/framer-core.c index f547c22e26ac..58f5143359df 100644 --- a/drivers/net/wan/framer/framer-core.c +++ b/drivers/net/wan/framer/framer-core.c @@ -732,8 +732,8 @@ EXPORT_SYMBOL_GPL(devm_framer_create); /** * framer_provider_simple_of_xlate() - returns the framer instance from framer provider - * @dev: the framer provider device - * @args: of_phandle_args (not used here) + * @dev: the framer provider device (not used here) + * @args: of_phandle_args * * Intended to be used by framer provider for the common case where #framer-cells is * 0. For other cases where #framer-cells is greater than '0', the framer provider @@ -743,21 +743,14 @@ EXPORT_SYMBOL_GPL(devm_framer_create); struct framer *framer_provider_simple_of_xlate(struct device *dev, const struct of_phandle_args *args) { - struct class_dev_iter iter; - struct framer *framer; - - class_dev_iter_init(&iter, &framer_class, NULL, NULL); - while ((dev = class_dev_iter_next(&iter))) { - framer = dev_to_framer(dev); - if (args->np != framer->dev.of_node) - continue; + struct device *target_dev; - class_dev_iter_exit(&iter); - return framer; - } + target_dev = class_find_device_by_of_node(&framer_class, args->np); + if (!target_dev) + return ERR_PTR(-ENODEV); - class_dev_iter_exit(&iter); - return ERR_PTR(-ENODEV); + put_device(target_dev); + return dev_to_framer(target_dev); } EXPORT_SYMBOL_GPL(framer_provider_simple_of_xlate); diff --git a/drivers/net/wwan/t7xx/t7xx_pci.c b/drivers/net/wwan/t7xx/t7xx_pci.c index 8381b0dc7acb..02f2ec7cf4ce 100644 --- a/drivers/net/wwan/t7xx/t7xx_pci.c +++ b/drivers/net/wwan/t7xx/t7xx_pci.c @@ -43,6 +43,8 @@ #include "t7xx_state_monitor.h" #include "t7xx_port_proxy.h" +#define DRIVER_NAME "mtk_t7xx" + #define T7XX_PCI_IREG_BASE 0 #define T7XX_PCI_EREG_BASE 2 @@ -833,6 +835,7 @@ static void t7xx_pci_infracfg_ao_calc(struct t7xx_pci_dev *t7xx_dev) static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct t7xx_pci_dev *t7xx_dev; + void __iomem *iomem; int ret; t7xx_dev = devm_kzalloc(&pdev->dev, sizeof(*t7xx_dev), GFP_KERNEL); @@ -848,12 +851,21 @@ static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); - ret = pcim_iomap_regions(pdev, BIT(T7XX_PCI_IREG_BASE) | BIT(T7XX_PCI_EREG_BASE), - pci_name(pdev)); + iomem = pcim_iomap_region(pdev, T7XX_PCI_IREG_BASE, DRIVER_NAME); + ret = PTR_ERR_OR_ZERO(iomem); + if (ret) { + dev_err(&pdev->dev, "Could not request IREG BAR: %d\n", ret); + return -ENOMEM; + } + IREG_BASE(t7xx_dev) = iomem; + + iomem = pcim_iomap_region(pdev, T7XX_PCI_EREG_BASE, DRIVER_NAME); + ret = PTR_ERR_OR_ZERO(iomem); if (ret) { - dev_err(&pdev->dev, "Could 
not request BARs: %d\n", ret); + dev_err(&pdev->dev, "Could not request EREG BAR: %d\n", ret); return -ENOMEM; } + t7xx_dev->base_addr.pcie_ext_reg_base = iomem; ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); if (ret) { @@ -867,9 +879,6 @@ static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - IREG_BASE(t7xx_dev) = pcim_iomap_table(pdev)[T7XX_PCI_IREG_BASE]; - t7xx_dev->base_addr.pcie_ext_reg_base = pcim_iomap_table(pdev)[T7XX_PCI_EREG_BASE]; - ret = t7xx_pci_pm_init(t7xx_dev); if (ret) return ret; @@ -937,7 +946,7 @@ static const struct pci_device_id t7xx_pci_table[] = { MODULE_DEVICE_TABLE(pci, t7xx_pci_table); static struct pci_driver t7xx_pci_driver = { - .name = "mtk_t7xx", + .name = DRIVER_NAME, .id_table = t7xx_pci_table, .probe = t7xx_pci_probe, .remove = t7xx_pci_remove, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e63dd3443b9..9ef57f6f584a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2589,10 +2589,10 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog); + const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress); + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -2862,15 +2862,15 @@ struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { return 0; } diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index ecf203f01034..9a1eacf35d37 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -81,7 +81,7 @@ static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per + * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. @@ -104,7 +104,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr) * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is all zeroes. + * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ @@ -123,7 +123,7 @@ static inline bool is_zero_ether_addr(const u8 *addr) * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a multicast address. + * Return: true if the address is a multicast address. 
 * By definition the broadcast address is also a multicast address.
 */
 static inline bool is_multicast_ether_addr(const u8 *addr)
@@ -157,7 +157,7 @@ static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
 * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
- * Return true if the address is a local address.
+ * Return: true if the address is a local address.
 */
 static inline bool is_local_ether_addr(const u8 *addr)
 {
@@ -168,7 +168,7 @@ static inline bool is_local_ether_addr(const u8 *addr)
 * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
- * Return true if the address is the broadcast address.
+ * Return: true if the address is the broadcast address.
 *
 * Please note: addr must be aligned to u16.
 */
@@ -183,7 +183,7 @@ static inline bool is_broadcast_ether_addr(const u8 *addr)
 * is_unicast_ether_addr - Determine if the Ethernet address is unicast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
- * Return true if the address is a unicast address.
+ * Return: true if the address is a unicast address.
 */
 static inline bool is_unicast_ether_addr(const u8 *addr)
 {
@@ -197,7 +197,7 @@ static inline bool is_unicast_ether_addr(const u8 *addr)
 *
 * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
 * a multicast address, and is not FF:FF:FF:FF:FF:FF.
 *
- * Return true if the address is valid.
+ * Return: true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
@@ -214,7 +214,7 @@ static inline bool is_valid_ether_addr(const u8 *addr)
 *
 * Check that the value from the Ethertype/length field is a valid Ethertype.
 *
- * Return true if the valid is an 802.3 supported Ethertype.
+ * Return: true if the value is an 802.3 supported Ethertype.
 */
 static inline bool eth_proto_is_802_3(__be16 proto)
 {
@@ -458,7 +458,7 @@ static inline bool ether_addr_is_ip_mcast(const u8 *addr)
 * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
- * Return a u64 value of the address
+ * Return: a u64 value of the address
 */
 static inline u64 ether_addr_to_u64(const u8 *addr)
 {
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b8b935b52603..f711bfd75c4d 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -257,7 +257,7 @@ struct ethtool_link_ksettings {
 * @mode : one of the ETHTOOL_LINK_MODE_*_BIT
 * (not atomic, no bound checking)
 *
- * Returns true/false.
+ * Returns: true/false.
 */
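Editorial aside: the one-word documentation diffs in this file and the ones that follow all normalize free-text "Return ..." lines into the ``Return:``/``Returns:`` section keyword that scripts/kernel-doc recognizes, so the rendered documentation gains a distinct return-value section. A hypothetical helper written in the enforced style:

.. code-block:: c

	/**
	 * my_is_even - report whether a value is even (illustrative helper)
	 * @x: value to test
	 *
	 * Return: true if @x is even, false otherwise.
	 */
	static inline bool my_is_even(int x)
	{
		return (x & 1) == 0;
	}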
 #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \
 	test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
@@ -711,6 +711,7 @@ struct ethtool_rxfh_param {
 * @cmd: command number = %ETHTOOL_GET_TS_INFO
 * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags
 * @phc_index: device index of the associated PHC, or -1 if there is none
+ * @phc_qualifier: qualifier of the associated PHC
 * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values
 * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values
 */
@@ -718,6 +719,7 @@ struct kernel_ethtool_ts_info {
 	u32 cmd;
 	u32 so_timestamping;
 	int phc_index;
+	enum hwtstamp_provider_qualifier phc_qualifier;
 	enum hwtstamp_tx_types tx_types;
 	enum hwtstamp_rx_filters rx_filters;
 };
@@ -749,6 +751,7 @@ struct kernel_ethtool_ts_info {
 *	@rss_context argument to @create_rxfh_context and friends.
 * @supported_coalesce_params: supported types of interrupt coalescing.
 * @supported_ring_params: supported ring params.
+ * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifiers.
 * @get_drvinfo: Report driver/device information. Modern drivers no
 *	longer have to implement this callback. Most fields are
 *	correctly filled in by the core using system information, or
@@ -966,6 +969,7 @@ struct ethtool_ops {
 	u32	rxfh_max_num_contexts;
 	u32	supported_coalesce_params;
 	u32	supported_ring_params;
+	u32	supported_hwtstamp_qualifiers;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
 	int	(*get_regs_len)(struct net_device *);
 	void	(*get_regs)(struct net_device *, struct ethtool_regs *, void *);
@@ -1199,7 +1203,7 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
 * @dev: pointer to net_device structure
 * @vclock_index: pointer to pointer of vclock index
 *
- * Return number of phc vclocks
+ * Return: number of phc vclocks
 */
 int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index);
@@ -1253,7 +1257,7 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
 * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer.
 * @dev: pointer to net_device structure
 * @info: buffer to hold the result
- * Returns zero on success, non-zero otherwise.
+ * Returns: zero on success, non-zero otherwise.
 */
 int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0477254bc2d3..a3ea46281595 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1179,17 +1179,18 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
 * This does not appear to be a real limitation for existing software.
*/ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *prog); + struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *prog); + const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, - struct bpf_prog *prog); + const struct bpf_prog *prog); void xdp_do_flush(void); -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d6326b53e336 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -310,7 +310,7 @@ static inline bool vlan_uses_dev(const struct net_device *dev) * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * - * Returns true if the ether type is a vlan ether type. + * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { @@ -341,9 +341,9 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, @@ -390,9 +390,9 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. 
*/ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -533,7 +533,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not of VLAN type + * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -551,7 +551,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if @skb->vlan_tci is not set correctly + * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) @@ -570,7 +570,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not VLAN tagged + * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -587,7 +587,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, @@ -629,7 +629,7 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) @@ -710,7 +710,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * - * Returns a new pointer to skb->data, or NULL on failure to pull. + * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { @@ -727,7 +727,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * - * Returns true if the skb is tagged, regardless of whether it is hardware + * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) @@ -743,7 +743,7 @@ static inline bool skb_vlan_tagged(const struct sk_buff *skb) * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * - * Returns true if the skb is tagged with multiple vlan headers, regardless + * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) @@ -774,7 +774,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) * @skb: skbuff to query * @features: features to be checked * - * Returns features without unsafe ones if the skb has multiple tags. 
+ * Returns: features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 5171231f70a8..073b30a9b850 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -87,6 +87,8 @@ struct ip_mc_list { char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; + unsigned long mca_cstamp; + unsigned long mca_tstamp; struct rcu_head rcu; }; diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 3a4e723eae0f..383ed9985802 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -222,6 +222,11 @@ static inline ktime_t ns_to_ktime(u64 ns) return ns; } +static inline ktime_t us_to_ktime(u64 us) +{ + return us * NSEC_PER_USEC; +} + static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..43b3cb4bf8d1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1095,7 +1095,9 @@ struct mlx5_ifc_qos_cap_bits { u8 log_esw_max_sched_depth[0x4]; u8 reserved_at_10[0x10]; - u8 reserved_at_20[0xb]; + u8 reserved_at_20[0x9]; + u8 esw_cross_esw_sched[0x1]; + u8 reserved_at_2a[0x1]; u8 log_max_qos_nic_queue_group[0x5]; u8 reserved_at_30[0x10]; @@ -1103,7 +1105,8 @@ struct mlx5_ifc_qos_cap_bits { u8 packet_pacing_min_rate[0x20]; - u8 reserved_at_80[0x10]; + u8 reserved_at_80[0xb]; + u8 log_esw_max_rate_limit[0x5]; u8 packet_pacing_rate_table_size[0x10]; u8 esw_element_type[0x10]; @@ -1590,12 +1593,14 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, MLX5_STEERING_FORMAT_CONNECTX_7 = 2, + MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; - u8 reserved_at_7[0x9]; + u8 abs_native_port_num[0x1]; + u8 reserved_at_8[0x8]; u8 shared_object_to_user_object_allowed[0x1]; u8 reserved_at_13[0xe]; u8 vhca_resource_manager[0x1]; @@ -4103,6 +4108,7 @@ enum { SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT = 0x5, }; enum { @@ -4111,34 +4117,41 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, + ELEMENT_TYPE_CAP_MASK_RATE_LIMIT = 1 << 5, }; enum { TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB = 0x3, }; enum { TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, TSAR_TYPE_CAP_MASK_ETS = 1 << 2, + TSAR_TYPE_CAP_MASK_TC_ARB = 1 << 3, }; struct mlx5_ifc_tsar_element_bits { - u8 reserved_at_0[0x8]; + u8 traffic_class[0x4]; + u8 reserved_at_4[0x4]; u8 tsar_type[0x8]; u8 reserved_at_10[0x10]; }; struct mlx5_ifc_vport_element_bits { - u8 reserved_at_0[0x10]; + u8 reserved_at_0[0x4]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; struct mlx5_ifc_vport_tc_element_bits { u8 traffic_class[0x4]; - u8 reserved_at_4[0xc]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; @@ -4163,7 +4176,9 @@ struct mlx5_ifc_scheduling_context_bits { u8 max_average_bw[0x20]; - u8 reserved_at_e0[0x120]; + u8 max_bw_obj_id[0x20]; + + u8 reserved_at_100[0x100]; }; 
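Editorial aside: the ktime.h hunk above fills the gap between ns_to_ktime() and ms_to_ktime(). A small usage sketch (my_timer and the 150 us interval are illustrative, not taken from the patch):

.. code-block:: c

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer my_timer;	/* hypothetical timer */

	static void my_arm_poll_timer(void)
	{
		/* 150 us as ktime_t (150000 ns), instead of open-coding
		 * 150 * NSEC_PER_USEC.
		 */
		hrtimer_start(&my_timer, us_to_ktime(150), HRTIMER_MODE_REL);
	}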
 struct mlx5_ifc_rqtc_bits {
@@ -6324,6 +6339,20 @@ struct mlx5_ifc_modify_other_hca_cap_in_bits {
 	struct mlx5_ifc_other_hca_cap_bits other_capability;
 };
 
+struct mlx5_ifc_sw_owner_icm_root_params_bits {
+	u8 sw_owner_icm_root_1[0x40];
+
+	u8 sw_owner_icm_root_0[0x40];
+};
+
+struct mlx5_ifc_rtc_params_bits {
+	u8 rtc_id_0[0x20];
+
+	u8 rtc_id_1[0x20];
+
+	u8 reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_flow_table_context_bits {
 	u8 reformat_en[0x1];
 	u8 decap_en[0x1];
@@ -6342,20 +6371,10 @@ struct mlx5_ifc_flow_table_context_bits {
 	u8 lag_master_next_table_id[0x18];
 
 	u8 reserved_at_60[0x60];
-	union {
-		struct {
-			u8 sw_owner_icm_root_1[0x40];
-
-			u8 sw_owner_icm_root_0[0x40];
-		} sws;
-		struct {
-			u8 rtc_id_0[0x20];
-			u8 rtc_id_1[0x20];
-
-			u8 reserved_at_100[0x40];
-
-		} hws;
+	union {
+		struct mlx5_ifc_sw_owner_icm_root_params_bits sws;
+		struct mlx5_ifc_rtc_params_bits hws;
 	};
 };
diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index 662074b08c94..ff0758e88ea1 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -20,6 +20,33 @@ enum hwtstamp_source {
 };
 
 /**
+ * struct hwtstamp_provider_desc - hwtstamp provider description
+ *
+ * @index: index of the hwtstamp provider.
+ * @qualifier: hwtstamp provider qualifier.
+ */
+struct hwtstamp_provider_desc {
+	int index;
+	enum hwtstamp_provider_qualifier qualifier;
+};
+
+/**
+ * struct hwtstamp_provider - hwtstamp provider object
+ *
+ * @rcu_head: RCU callback used to free the struct.
+ * @source: source of the hwtstamp provider.
+ * @phydev: pointer to the phydev source in case the PTP comes from phylib
+ * @desc: hwtstamp provider description.
+ */
+struct hwtstamp_provider {
+	struct rcu_head rcu_head;
+	enum hwtstamp_source source;
+	struct phy_device *phydev;
+	struct hwtstamp_provider_desc desc;
+};
+
+/**
 * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
 *
 * @flags: see struct hwtstamp_config
@@ -31,6 +58,7 @@ enum hwtstamp_source {
 *	copied the ioctl request back to user space
 * @source: indication whether timestamps should come from the netdev or from
 *	an attached phylib PHY
+ * @qualifier: qualifier of the hwtstamp provider
 *
 * Prefer using this structure for in-kernel processing of hardware
 * timestamping configuration, over the inextensible struct hwtstamp_config
@@ -43,6 +71,7 @@ struct kernel_hwtstamp_config {
 	struct ifreq *ifr;
 	bool copied_to_user;
 	enum hwtstamp_source source;
+	enum hwtstamp_provider_qualifier qualifier;
 };
 
 static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ecc686409161..2593019ad5b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -82,6 +82,7 @@ struct xdp_metadata_ops;
 struct xdp_md;
 struct ethtool_netdev_state;
 struct phy_link_topology;
+struct hwtstamp_provider;
 
 typedef u32 xdp_features_t;
 
@@ -509,7 +510,7 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n)
 *	is scheduled for example in the context of delayed timer
 *	that can be skipped if a NAPI is already scheduled.
 *
- * Return True if NAPI is scheduled, False otherwise.
+ * Return: True if NAPI is scheduled, False otherwise.
 */
 static inline bool napi_is_scheduled(struct napi_struct *n)
 {
@@ -524,7 +525,7 @@ bool napi_schedule_prep(struct napi_struct *n);
 *
 * Schedule NAPI poll routine to be called if it is not already
 * running.
- * Return true if we schedule a NAPI or false if not.
+ * Return: true if we schedule a NAPI or false if not.
* Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ @@ -558,7 +559,7 @@ static inline void napi_schedule_irqoff(struct napi_struct *n) * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). - * Return false if device should avoid rearming interrupts. + * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); @@ -2045,6 +2046,7 @@ enum netdev_reg_state { * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. + * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2457,6 +2459,8 @@ struct net_device { struct hlist_head neighbours[NEIGH_NR_TABLES]; + struct hwtstamp_provider __rcu *hwprov; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; @@ -2854,6 +2858,46 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) u64_stats_update_end(&lstats->syncp); } +static inline void dev_dstats_rx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_packets); + u64_stats_add(&dstats->rx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_rx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_drops); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_packets); + u64_stats_add(&dstats->tx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_drops); + u64_stats_update_end(&dstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ @@ -3322,6 +3366,7 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DECLARE_PER_CPU(struct page_pool *, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) @@ -3810,7 +3855,7 @@ static inline bool netif_attr_test_mask(unsigned long j, * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * - * Returns true if a CPU/Rx queue is online. + * Returns: true if a CPU/Rx queue is online. */ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, @@ -3830,7 +3875,8 @@ static inline bool netif_attr_test_online(unsigned long j, * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set. + * Returns: next (after n) CPU/Rx queue index in the mask; + * >= nr_bits if no further CPUs/Rx queues set. 
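+ *
+ * For instance (editor's note): with bits 1 and 3 set in @srcp and
+ * n == 1, this returns 3; with n == 3, it returns a value >= nr_bits.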
*/ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) @@ -3852,7 +3898,8 @@ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set in both. + * Returns: next (after n) CPU/Rx queue index set in both masks; + * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, @@ -3958,9 +4005,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog); -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); + const struct bpf_prog *xdp_prog); +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5897f3dbaf7c..f39f688d7285 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -357,7 +357,7 @@ extern struct static_key xt_tee_enabled; * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) - * Returns : + * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 8676316547cc..3175073a66ba 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -66,7 +66,6 @@ static inline bool nf_hook_egress_active(void) * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * - * Returns @skb on success or %NULL if the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. @@ -81,6 +80,8 @@ static inline bool nf_hook_egress_active(void) * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. + * + * Returns: @skb on success or %NULL if the packet was consumed or filtered. 
 */
 static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
 					     struct net_device *dev)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index b34301650c47..f91e50a76efd 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -57,7 +57,7 @@ static inline void netpoll_poll_disable(struct net_device *dev) { return; }
 static inline void netpoll_poll_enable(struct net_device *dev) { return; }
 #endif
 
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
diff --git a/include/linux/packing.h b/include/linux/packing.h
index 5d36dcd06f60..0589d70bbe04 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -8,6 +8,83 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 
+#define GEN_PACKED_FIELD_STRUCT(__type) \
+	struct packed_field_ ## __type { \
+		__type startbit; \
+		__type endbit; \
+		__type offset; \
+		__type size; \
+	}
+
+/* struct packed_field_u8. Use with bit offsets < 256, buffers < 32B and
+ * unpacked structures < 256B.
+ */
+GEN_PACKED_FIELD_STRUCT(u8);
+
+/* struct packed_field_u16. Use with bit offsets < 65536, buffers < 8KB and
+ * unpacked structures < 64KB.
+ */
+GEN_PACKED_FIELD_STRUCT(u16);
+
+#define PACKED_FIELD(start, end, struct_name, struct_field) \
+{ \
+	(start), \
+	(end), \
+	offsetof(struct_name, struct_field), \
+	sizeof_field(struct_name, struct_field), \
+}
+
+#define CHECK_PACKED_FIELD_OVERLAP(fields, index1, index2) ({ \
+	typeof(&(fields)[0]) __f = (fields); \
+	typeof(__f[0]) _f1 = __f[index1]; typeof(__f[0]) _f2 = __f[index2]; \
+	const bool _ascending = __f[0].startbit < __f[1].startbit; \
+	BUILD_BUG_ON_MSG(_ascending && _f1.startbit >= _f2.startbit, \
+			 __stringify(fields) " field " __stringify(index2) \
+			 " breaks ascending order"); \
+	BUILD_BUG_ON_MSG(!_ascending && _f1.startbit <= _f2.startbit, \
+			 __stringify(fields) " field " __stringify(index2) \
+			 " breaks descending order"); \
+	BUILD_BUG_ON_MSG(max(_f1.endbit, _f2.endbit) <= \
+			 min(_f1.startbit, _f2.startbit), \
+			 __stringify(fields) " field " __stringify(index2) \
+			 " overlaps with previous field"); \
+})
+
+#define CHECK_PACKED_FIELD(fields, index) ({ \
+	typeof(&(fields)[0]) _f = (fields); \
+	typeof(_f[0]) __f = _f[index]; \
+	BUILD_BUG_ON_MSG(__f.startbit < __f.endbit, \
+			 __stringify(fields) " field " __stringify(index) \
+			 " start bit must not be smaller than end bit"); \
+	BUILD_BUG_ON_MSG(__f.size != 1 && __f.size != 2 && \
+			 __f.size != 4 && __f.size != 8, \
+			 __stringify(fields) " field " __stringify(index) \
+			 " has unsupported unpacked storage size"); \
+	BUILD_BUG_ON_MSG(__f.startbit - __f.endbit >= BITS_PER_BYTE * __f.size, \
+			 __stringify(fields) " field " __stringify(index) \
+			 " exceeds unpacked storage size"); \
+	__builtin_choose_expr(index != 0, \
+			      CHECK_PACKED_FIELD_OVERLAP(fields, index - 1, index), \
+			      1); \
+})
+
+/* Note that the packed fields may be either in ascending or descending order.
+ * Thus, we must check that both the first and last field fit within the
+ * packed buffer size.
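+ *
+ * For example (editor's note, not part of the original patch), a
+ * descending-order layout over a 2-byte buffer could be declared as
+ *
+ *	static const struct packed_field_u8 my_fields[] = {
+ *		PACKED_FIELD(15, 8, struct my_unpacked, a),
+ *		PACKED_FIELD(7, 0, struct my_unpacked, b),
+ *	};
+ *
+ * where the first entry holds the largest startbit (15), so validating
+ * the first and the last entry covers both orderings.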
+ */ +#define CHECK_PACKED_FIELDS_SIZE(fields, pbuflen) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(pbuflen) _len = (pbuflen); \ + const size_t num_fields = ARRAY_SIZE(fields); \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_len), \ + __stringify(fields) " pbuflen " __stringify(pbuflen) \ + " must be a compile time constant"); \ + BUILD_BUG_ON_MSG(_f[0].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " first field exceeds packed buffer size"); \ + BUILD_BUG_ON_MSG(_f[num_fields - 1].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " last field exceeds packed buffer size"); \ +}) + #define QUIRK_MSB_ON_THE_RIGHT BIT(0) #define QUIRK_LITTLE_ENDIAN BIT(1) #define QUIRK_LSW32_IS_FIRST BIT(2) @@ -26,4 +103,352 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +/* Do not hand-edit the following packed field check macros! + * + * They are generated using scripts/gen_packed_field_checks.c, which may be + * built via "make scripts_gen_packed_field_checks". If larger macro sizes are + * needed in the future, please use this program to re-generate the macros and + * insert them here. 
+ */ + +#define CHECK_PACKED_FIELDS_1(fields) \ + CHECK_PACKED_FIELD(fields, 0) + +#define CHECK_PACKED_FIELDS_2(fields) do { \ + CHECK_PACKED_FIELDS_1(fields); \ + CHECK_PACKED_FIELD(fields, 1); \ +} while (0) + +#define CHECK_PACKED_FIELDS_3(fields) do { \ + CHECK_PACKED_FIELDS_2(fields); \ + CHECK_PACKED_FIELD(fields, 2); \ +} while (0) + +#define CHECK_PACKED_FIELDS_4(fields) do { \ + CHECK_PACKED_FIELDS_3(fields); \ + CHECK_PACKED_FIELD(fields, 3); \ +} while (0) + +#define CHECK_PACKED_FIELDS_5(fields) do { \ + CHECK_PACKED_FIELDS_4(fields); \ + CHECK_PACKED_FIELD(fields, 4); \ +} while (0) + +#define CHECK_PACKED_FIELDS_6(fields) do { \ + CHECK_PACKED_FIELDS_5(fields); \ + CHECK_PACKED_FIELD(fields, 5); \ +} while (0) + +#define CHECK_PACKED_FIELDS_7(fields) do { \ + CHECK_PACKED_FIELDS_6(fields); \ + CHECK_PACKED_FIELD(fields, 6); \ +} while (0) + +#define CHECK_PACKED_FIELDS_8(fields) do { \ + CHECK_PACKED_FIELDS_7(fields); \ + CHECK_PACKED_FIELD(fields, 7); \ +} while (0) + +#define CHECK_PACKED_FIELDS_9(fields) do { \ + CHECK_PACKED_FIELDS_8(fields); \ + CHECK_PACKED_FIELD(fields, 8); \ +} while (0) + +#define CHECK_PACKED_FIELDS_10(fields) do { \ + CHECK_PACKED_FIELDS_9(fields); \ + CHECK_PACKED_FIELD(fields, 9); \ +} while (0) + +#define CHECK_PACKED_FIELDS_11(fields) do { \ + CHECK_PACKED_FIELDS_10(fields); \ + CHECK_PACKED_FIELD(fields, 10); \ +} while (0) + +#define CHECK_PACKED_FIELDS_12(fields) do { \ + CHECK_PACKED_FIELDS_11(fields); \ + CHECK_PACKED_FIELD(fields, 11); \ +} while (0) + +#define CHECK_PACKED_FIELDS_13(fields) do { \ + CHECK_PACKED_FIELDS_12(fields); \ + CHECK_PACKED_FIELD(fields, 12); \ +} while (0) + +#define CHECK_PACKED_FIELDS_14(fields) do { \ + CHECK_PACKED_FIELDS_13(fields); \ + CHECK_PACKED_FIELD(fields, 13); \ +} while (0) + +#define CHECK_PACKED_FIELDS_15(fields) do { \ + CHECK_PACKED_FIELDS_14(fields); \ + CHECK_PACKED_FIELD(fields, 14); \ +} while (0) + +#define CHECK_PACKED_FIELDS_16(fields) do { \ + CHECK_PACKED_FIELDS_15(fields); \ + CHECK_PACKED_FIELD(fields, 15); \ +} while (0) + +#define CHECK_PACKED_FIELDS_17(fields) do { \ + CHECK_PACKED_FIELDS_16(fields); \ + CHECK_PACKED_FIELD(fields, 16); \ +} while (0) + +#define CHECK_PACKED_FIELDS_18(fields) do { \ + CHECK_PACKED_FIELDS_17(fields); \ + CHECK_PACKED_FIELD(fields, 17); \ +} while (0) + +#define CHECK_PACKED_FIELDS_19(fields) do { \ + CHECK_PACKED_FIELDS_18(fields); \ + CHECK_PACKED_FIELD(fields, 18); \ +} while (0) + +#define CHECK_PACKED_FIELDS_20(fields) do { \ + CHECK_PACKED_FIELDS_19(fields); \ + CHECK_PACKED_FIELD(fields, 19); \ +} while (0) + +#define CHECK_PACKED_FIELDS_21(fields) do { \ + CHECK_PACKED_FIELDS_20(fields); \ + CHECK_PACKED_FIELD(fields, 20); \ +} while (0) + +#define CHECK_PACKED_FIELDS_22(fields) do { \ + CHECK_PACKED_FIELDS_21(fields); \ + CHECK_PACKED_FIELD(fields, 21); \ +} while (0) + +#define CHECK_PACKED_FIELDS_23(fields) do { \ + CHECK_PACKED_FIELDS_22(fields); \ + CHECK_PACKED_FIELD(fields, 22); \ +} while (0) + +#define CHECK_PACKED_FIELDS_24(fields) do { \ + CHECK_PACKED_FIELDS_23(fields); \ + CHECK_PACKED_FIELD(fields, 23); \ +} while (0) + +#define CHECK_PACKED_FIELDS_25(fields) do { \ + CHECK_PACKED_FIELDS_24(fields); \ + CHECK_PACKED_FIELD(fields, 24); \ +} while (0) + +#define CHECK_PACKED_FIELDS_26(fields) do { \ + CHECK_PACKED_FIELDS_25(fields); \ + CHECK_PACKED_FIELD(fields, 25); \ +} while (0) + +#define CHECK_PACKED_FIELDS_27(fields) do { \ + CHECK_PACKED_FIELDS_26(fields); \ + CHECK_PACKED_FIELD(fields, 26); \ +} while (0) + +#define 
CHECK_PACKED_FIELDS_28(fields) do { \ + CHECK_PACKED_FIELDS_27(fields); \ + CHECK_PACKED_FIELD(fields, 27); \ +} while (0) + +#define CHECK_PACKED_FIELDS_29(fields) do { \ + CHECK_PACKED_FIELDS_28(fields); \ + CHECK_PACKED_FIELD(fields, 28); \ +} while (0) + +#define CHECK_PACKED_FIELDS_30(fields) do { \ + CHECK_PACKED_FIELDS_29(fields); \ + CHECK_PACKED_FIELD(fields, 29); \ +} while (0) + +#define CHECK_PACKED_FIELDS_31(fields) do { \ + CHECK_PACKED_FIELDS_30(fields); \ + CHECK_PACKED_FIELD(fields, 30); \ +} while (0) + +#define CHECK_PACKED_FIELDS_32(fields) do { \ + CHECK_PACKED_FIELDS_31(fields); \ + CHECK_PACKED_FIELD(fields, 31); \ +} while (0) + +#define CHECK_PACKED_FIELDS_33(fields) do { \ + CHECK_PACKED_FIELDS_32(fields); \ + CHECK_PACKED_FIELD(fields, 32); \ +} while (0) + +#define CHECK_PACKED_FIELDS_34(fields) do { \ + CHECK_PACKED_FIELDS_33(fields); \ + CHECK_PACKED_FIELD(fields, 33); \ +} while (0) + +#define CHECK_PACKED_FIELDS_35(fields) do { \ + CHECK_PACKED_FIELDS_34(fields); \ + CHECK_PACKED_FIELD(fields, 34); \ +} while (0) + +#define CHECK_PACKED_FIELDS_36(fields) do { \ + CHECK_PACKED_FIELDS_35(fields); \ + CHECK_PACKED_FIELD(fields, 35); \ +} while (0) + +#define CHECK_PACKED_FIELDS_37(fields) do { \ + CHECK_PACKED_FIELDS_36(fields); \ + CHECK_PACKED_FIELD(fields, 36); \ +} while (0) + +#define CHECK_PACKED_FIELDS_38(fields) do { \ + CHECK_PACKED_FIELDS_37(fields); \ + CHECK_PACKED_FIELD(fields, 37); \ +} while (0) + +#define CHECK_PACKED_FIELDS_39(fields) do { \ + CHECK_PACKED_FIELDS_38(fields); \ + CHECK_PACKED_FIELD(fields, 38); \ +} while (0) + +#define CHECK_PACKED_FIELDS_40(fields) do { \ + CHECK_PACKED_FIELDS_39(fields); \ + CHECK_PACKED_FIELD(fields, 39); \ +} while (0) + +#define CHECK_PACKED_FIELDS_41(fields) do { \ + CHECK_PACKED_FIELDS_40(fields); \ + CHECK_PACKED_FIELD(fields, 40); \ +} while (0) + +#define CHECK_PACKED_FIELDS_42(fields) do { \ + CHECK_PACKED_FIELDS_41(fields); \ + CHECK_PACKED_FIELD(fields, 41); \ +} while (0) + +#define CHECK_PACKED_FIELDS_43(fields) do { \ + CHECK_PACKED_FIELDS_42(fields); \ + CHECK_PACKED_FIELD(fields, 42); \ +} while (0) + +#define CHECK_PACKED_FIELDS_44(fields) do { \ + CHECK_PACKED_FIELDS_43(fields); \ + CHECK_PACKED_FIELD(fields, 43); \ +} while (0) + +#define CHECK_PACKED_FIELDS_45(fields) do { \ + CHECK_PACKED_FIELDS_44(fields); \ + CHECK_PACKED_FIELD(fields, 44); \ +} while (0) + +#define CHECK_PACKED_FIELDS_46(fields) do { \ + CHECK_PACKED_FIELDS_45(fields); \ + CHECK_PACKED_FIELD(fields, 45); \ +} while (0) + +#define CHECK_PACKED_FIELDS_47(fields) do { \ + CHECK_PACKED_FIELDS_46(fields); \ + CHECK_PACKED_FIELD(fields, 46); \ +} while (0) + +#define CHECK_PACKED_FIELDS_48(fields) do { \ + CHECK_PACKED_FIELDS_47(fields); \ + CHECK_PACKED_FIELD(fields, 47); \ +} while (0) + +#define CHECK_PACKED_FIELDS_49(fields) do { \ + CHECK_PACKED_FIELDS_48(fields); \ + CHECK_PACKED_FIELD(fields, 48); \ +} while (0) + +#define CHECK_PACKED_FIELDS_50(fields) do { \ + CHECK_PACKED_FIELDS_49(fields); \ + CHECK_PACKED_FIELD(fields, 49); \ +} while (0) + +#define CHECK_PACKED_FIELDS(fields) \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 1, ({ CHECK_PACKED_FIELDS_1(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 2, ({ CHECK_PACKED_FIELDS_2(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 3, ({ CHECK_PACKED_FIELDS_3(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 4, ({ CHECK_PACKED_FIELDS_4(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 5, ({ CHECK_PACKED_FIELDS_5(fields); }), 
\ + __builtin_choose_expr(ARRAY_SIZE(fields) == 6, ({ CHECK_PACKED_FIELDS_6(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 7, ({ CHECK_PACKED_FIELDS_7(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 8, ({ CHECK_PACKED_FIELDS_8(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 9, ({ CHECK_PACKED_FIELDS_9(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 10, ({ CHECK_PACKED_FIELDS_10(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 11, ({ CHECK_PACKED_FIELDS_11(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 12, ({ CHECK_PACKED_FIELDS_12(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 13, ({ CHECK_PACKED_FIELDS_13(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 14, ({ CHECK_PACKED_FIELDS_14(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 15, ({ CHECK_PACKED_FIELDS_15(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 16, ({ CHECK_PACKED_FIELDS_16(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 17, ({ CHECK_PACKED_FIELDS_17(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 18, ({ CHECK_PACKED_FIELDS_18(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 19, ({ CHECK_PACKED_FIELDS_19(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 20, ({ CHECK_PACKED_FIELDS_20(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 21, ({ CHECK_PACKED_FIELDS_21(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 22, ({ CHECK_PACKED_FIELDS_22(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 23, ({ CHECK_PACKED_FIELDS_23(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 24, ({ CHECK_PACKED_FIELDS_24(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 25, ({ CHECK_PACKED_FIELDS_25(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 26, ({ CHECK_PACKED_FIELDS_26(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 27, ({ CHECK_PACKED_FIELDS_27(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 28, ({ CHECK_PACKED_FIELDS_28(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 29, ({ CHECK_PACKED_FIELDS_29(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 30, ({ CHECK_PACKED_FIELDS_30(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 31, ({ CHECK_PACKED_FIELDS_31(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 32, ({ CHECK_PACKED_FIELDS_32(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 33, ({ CHECK_PACKED_FIELDS_33(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 34, ({ CHECK_PACKED_FIELDS_34(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 35, ({ CHECK_PACKED_FIELDS_35(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 36, ({ CHECK_PACKED_FIELDS_36(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 37, ({ CHECK_PACKED_FIELDS_37(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 38, ({ CHECK_PACKED_FIELDS_38(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 39, ({ CHECK_PACKED_FIELDS_39(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 40, ({ CHECK_PACKED_FIELDS_40(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 41, ({ CHECK_PACKED_FIELDS_41(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 42, ({ CHECK_PACKED_FIELDS_42(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 43, ({ CHECK_PACKED_FIELDS_43(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 44, ({ CHECK_PACKED_FIELDS_44(fields); }), \ + 
__builtin_choose_expr(ARRAY_SIZE(fields) == 45, ({ CHECK_PACKED_FIELDS_45(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 46, ({ CHECK_PACKED_FIELDS_46(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 47, ({ CHECK_PACKED_FIELDS_47(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 48, ({ CHECK_PACKED_FIELDS_48(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 49, ({ CHECK_PACKED_FIELDS_49(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 50, ({ CHECK_PACKED_FIELDS_50(fields); }), \ + ({ BUILD_BUG_ON_MSG(1, "CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than 50."); }) \ +)))))))))))))))))))))))))))))))))))))))))))))))))) + +/* End of generated content */ + +#define pack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : pack_fields_u8, \ + const struct packed_field_u16 * : pack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + +#define unpack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : unpack_fields_u8, \ + const struct packed_field_u16 * : unpack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + #endif diff --git a/include/linux/phy.h b/include/linux/phy.h index 563c46205685..e597a32cc787 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -298,6 +298,29 @@ static inline const char *phy_modes(phy_interface_t interface) } } +/** + * rgmii_clock - map link speed to the clock rate + * @speed: link speed value + * + * Description: maps RGMII supported link speeds + * into the clock rates. + * + * Returns: clock rate or negative errno + */ +static inline long rgmii_clock(int speed) +{ + switch (speed) { + case SPEED_10: + return 2500000; + case SPEED_100: + return 25000000; + case SPEED_1000: + return 125000000; + default: + return -EINVAL; + } +} + #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 @@ -818,6 +841,24 @@ struct phy_tdr_config { #define PHY_PAIR_ALL -1 /** + * enum link_inband_signalling - in-band signalling modes that are supported + * + * @LINK_INBAND_DISABLE: in-band signalling can be disabled + * @LINK_INBAND_ENABLE: in-band signalling can be enabled without bypass + * @LINK_INBAND_BYPASS: in-band signalling can be enabled with bypass + * + * The possible and required bits can only be used if the valid bit is set. + * If possible is clear, that means inband signalling can not be used. + * Required is only valid when possible is set, and means that inband + * signalling must be used. + */ +enum link_inband_signalling { + LINK_INBAND_DISABLE = BIT(0), + LINK_INBAND_ENABLE = BIT(1), + LINK_INBAND_BYPASS = BIT(2), +}; + +/** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * @@ -957,6 +998,19 @@ struct phy_driver { int (*get_features)(struct phy_device *phydev); /** + * @inband_caps: query whether in-band is supported for the given PHY + * interface mode. Returns a bitmask of bits defined by enum + * link_inband_signalling. 
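+	 *
+	 * A driver implementation might look like this (a sketch; the
+	 * foo_ prefix is a placeholder):
+	 *
+	 *	static unsigned int foo_inband_caps(struct phy_device *phydev,
+	 *					    phy_interface_t interface)
+	 *	{
+	 *		if (interface == PHY_INTERFACE_MODE_SGMII)
+	 *			return LINK_INBAND_DISABLE |
+	 *			       LINK_INBAND_ENABLE;
+	 *
+	 *		return 0;
+	 *	}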
+	 */
+	unsigned int (*inband_caps)(struct phy_device *phydev,
+				    phy_interface_t interface);
+
+	/**
+	 * @config_inband: configure in-band mode for the PHY
+	 */
+	int (*config_inband)(struct phy_device *phydev, unsigned int modes);
+
+	/**
 	 * @get_rate_matching: Get the supported type of rate matching for a
 	 * particular phy interface. This is used by phy consumers to determine
 	 * whether to advertise lower-speed modes for that interface. It is
@@ -1818,6 +1872,9 @@ int phy_config_aneg(struct phy_device *phydev);
 int _phy_start_aneg(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
+unsigned int phy_inband_caps(struct phy_device *phydev,
+			     phy_interface_t interface);
+int phy_config_inband(struct phy_device *phydev, unsigned int modes);
 int phy_speed_down(struct phy_device *phydev, bool sync);
 int phy_speed_up(struct phy_device *phydev);
 bool phy_check_valid(int speed, int duplex, unsigned long *features);
@@ -1957,7 +2014,7 @@ int genphy_c45_plca_set_cfg(struct phy_device *phydev,
 int genphy_c45_plca_get_status(struct phy_device *phydev,
 			       struct phy_plca_status *plca_st);
 int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv,
-			     unsigned long *lp, bool *is_enabled);
+			     unsigned long *lp);
 int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
 			       struct ethtool_keee *data);
 int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 5c01048860c4..5462cc6a37dc 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -419,6 +419,7 @@ struct phylink_pcs {
 /**
  * struct phylink_pcs_ops - MAC PCS operations structure.
  * @pcs_validate: validate the link configuration.
+ * @pcs_inband_caps: query inband support for interface mode.
  * @pcs_enable: enable the PCS.
  * @pcs_disable: disable the PCS.
  * @pcs_pre_config: pre-mac_config method (for errata)
@@ -434,6 +435,8 @@ struct phylink_pcs {
 struct phylink_pcs_ops {
 	int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported,
 			    const struct phylink_link_state *state);
+	unsigned int (*pcs_inband_caps)(struct phylink_pcs *pcs,
+					phy_interface_t interface);
 	int (*pcs_enable)(struct phylink_pcs *pcs);
 	void (*pcs_disable)(struct phylink_pcs *pcs);
 	void (*pcs_pre_config)(struct phylink_pcs *pcs,
@@ -471,6 +474,20 @@ int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported,
 		 const struct phylink_link_state *state);
 
 /**
+ * pcs_inband_caps - query PCS in-band capabilities for interface mode.
+ * @pcs: a pointer to a &struct phylink_pcs.
+ * @interface: interface mode to be queried
+ *
+ * Returns zero if it is unknown what in-band signalling is supported by the
+ * PCS (e.g. because the PCS driver doesn't implement the method). Otherwise,
+ * returns a bit mask of the LINK_INBAND_* values from
+ * &enum link_inband_signalling to describe which inband modes are supported
+ * for this interface mode.
+ */
+unsigned int pcs_inband_caps(struct phylink_pcs *pcs,
+			     phy_interface_t interface);
+
+/**
  * pcs_enable() - enable the PCS.
  * @pcs: a pointer to a &struct phylink_pcs.
  */
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index c892d22ce0a7..0d68d09bedd1 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -307,7 +307,7 @@ static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm)
 * @info: Structure describing the new clock.
 * @parent: Pointer to the parent device of the new clock.
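 *
 * A typical caller distinguishes all three outcomes (a sketch; the my_
 * names are placeholders):
 *
 *	my->clock = ptp_clock_register(&my->caps, dev);
 *	if (IS_ERR(my->clock))
 *		return PTR_ERR(my->clock);
 *	if (!my->clock)
 *		dev_info(dev, "PHC support is disabled\n");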
* - * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * Returns: a valid pointer on success or PTR_ERR on failure. If PHC * support is missing at the configuration level, this function * returns NULL, and drivers are expected to gracefully handle that * case separately. @@ -445,7 +445,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * - * Returns converted timestamp, or 0 on error. + * Returns: converted timestamp, or 0 on error. */ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index fd037c127bb0..551329220e4f 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -615,15 +615,14 @@ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. - * In particular if you consume ring in interrupt or BH context, you must - * disable interrupts/BH when doing so. + * In particular if you consume ring in BH context, you must + * disable BH when doing so. */ -static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, - unsigned int nrings, - int size, - gfp_t gfp, void (*destroy)(void *)) +static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, + unsigned int nrings, + int size, gfp_t gfp, + void (*destroy)(void *)) { - unsigned long flags; void ***queues; int i; @@ -638,12 +637,12 @@ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, } for (i = 0; i < nrings; ++i) { - spin_lock_irqsave(&(rings[i])->consumer_lock, flags); + spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); - spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); + spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) @@ -662,8 +661,8 @@ nomem: noqueues: return -ENOMEM; } -#define ptr_ring_resize_multiple(...) \ - alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) +#define ptr_ring_resize_multiple_bh(...) \ + alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 997b34197385..6816e4c5f3f0 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -241,7 +241,7 @@ bool rfkill_soft_blocked(struct rfkill *rfkill); * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that corresponds to the name. + * Returns: enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 14b88f551920..3b9d132cbc9e 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -78,7 +78,7 @@ static inline bool lockdep_rtnl_is_held(void) * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit + * Return: the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. 
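 *
 * Example (a sketch; dev->rx_handler_data is one of many RTNL-protected
 * pointers):
 *
 *	ASSERT_RTNL();
 *	data = rtnl_dereference(dev->rx_handler_data);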
*/ #define rtnl_dereference(p) \ @@ -178,6 +178,12 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); +/* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ +struct ndo_fdb_dump_context { + unsigned long ifindex; + unsigned long fdb_idx; +}; + extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 926496c9cc9c..bf178238a308 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -199,17 +199,18 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } -static inline int skb_array_resize_multiple_noprof(struct skb_array **rings, - int nrings, unsigned int size, - gfp_t gfp) +static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings, + int nrings, + unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); - return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings, - nrings, size, gfp, - __skb_array_destroy_skb); + return ptr_ring_resize_multiple_bh_noprof((struct ptr_ring **)rings, + nrings, size, gfp, + __skb_array_destroy_skb); } -#define skb_array_resize_multiple(...) \ - alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__)) +#define skb_array_resize_multiple_bh(...) \ + alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 58009fa66102..b2509cd0b930 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1134,7 +1134,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb) * skb_dst - returns skb dst_entry * @skb: buffer * - * Returns skb dst_entry, regardless of reference taken or not. + * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { @@ -1222,7 +1222,7 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) * skb_unref - decrement the skb's reference count * @skb: buffer * - * Returns true if we can free the skb. + * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { @@ -1344,7 +1344,7 @@ struct sk_buff_fclones { * @sk: socket * @skb: buffer * - * Returns true if skb is a fast clone, and its clone is not freed. + * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ @@ -3516,7 +3516,7 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * - * Returns false if this page should be returned to page allocator, true + * Returns: false if this page should be returned to page allocator, true * otherwise. 
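 *
 * A driver recycling its Rx pages might use it as follows (a sketch;
 * my_rx_reuse_page() is a placeholder):
 *
 *	if (dev_page_is_reusable(page))
 *		my_rx_reuse_page(ring, page);
 *	else
 *		put_page(page);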
*/ static inline bool dev_page_is_reusable(const struct page *page) @@ -3627,13 +3627,13 @@ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog); + const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. The page must already + * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) @@ -3648,7 +3648,7 @@ static inline void *skb_frag_address(const skb_frag_t *frag) * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. Checks that the page + * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) @@ -3674,7 +3674,7 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto, bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** - * skb_frag_dma_map - maps a paged fragment via the DMA API + * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the @@ -3684,15 +3684,36 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); * * Maps the page associated with @frag to @device. */ -static inline dma_addr_t skb_frag_dma_map(struct device *dev, - const skb_frag_t *frag, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t __skb_frag_dma_map(struct device *dev, + const skb_frag_t *frag, + size_t offset, size_t size, + enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } +#define skb_frag_dma_map(dev, frag, ...) 
\ + CONCATENATE(_skb_frag_dma_map, \ + COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) + +#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ + const skb_frag_t *uf = (frag); \ + size_t uo = (offset); \ + \ + __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ + DMA_TO_DEVICE); \ +}) +#define _skb_frag_dma_map1(dev, frag, offset) \ + __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ + __UNIQUE_ID(offset_)) +#define _skb_frag_dma_map0(dev, frag) \ + _skb_frag_dma_map1(dev, frag, 0) +#define _skb_frag_dma_map2(dev, frag, offset, size) \ + __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) +#define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ + __skb_frag_dma_map(dev, frag, offset, size, dir) + static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { @@ -3890,7 +3911,7 @@ static inline int skb_linearize(struct sk_buff *skb) * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * - * Return true if the skb has at least one frag that might be modified + * Return: true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) @@ -4612,7 +4633,7 @@ static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) /* Check if we need to perform checksum complete validation. * - * Returns true if checksum complete is needed, false otherwise + * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d79ff252cfdc..c9878a612e53 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -33,7 +33,9 @@ #define STMMAC_CSR_20_35M 0x2 /* MDC = clk_scr_i/16 */ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ -#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ +#define STMMAC_CSR_300_500M 0x6 /* MDC = clk_scr_i/204 */ +#define STMMAC_CSR_500_800M 0x7 /* MDC = clk_scr_i/324 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 @@ -250,8 +252,8 @@ struct plat_stmmacenet_data { struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - unsigned int clk_ptp_rate; - unsigned int clk_ref_rate; + unsigned long clk_ptp_rate; + unsigned long clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; u32 cdc_error_adj; @@ -263,7 +265,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned int eee_usecs_rate; + unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 79c781875c09..a4d6cc0c9f68 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -97,7 +97,7 @@ struct wwan_port_caps { * * This function must be balanced with a call to wwan_remove_port(). 
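 *
 * Example (a sketch; my_ops and drvdata are driver-specific, and a NULL
 * caps pointer is assumed here to request default port capabilities):
 *
 *	port = wwan_create_port(&pdev->dev, WWAN_PORT_AT, &my_ops, NULL,
 *				drvdata);
 *	if (IS_ERR(port))
 *		return PTR_ERR(port);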
* - * Returns a valid pointer to wwan_port on success or PTR_ERR on failure + * Returns: a valid pointer to wwan_port on success or PTR_ERR on failure */ struct wwan_port *wwan_create_port(struct device *parent, enum wwan_port_type type, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 363dd63babe7..58337898fa21 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -88,6 +88,23 @@ struct ifa6_config { u16 scope; }; +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; + enum addr_type_t type; + bool force_rt_scope_universe; +}; + int addrconf_init(void); void addrconf_cleanup(void); @@ -525,4 +542,8 @@ int if6_proc_init(void); void if6_proc_exit(void); #endif +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args); + #endif diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27acf1292a5c..182f7965048f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5957,7 +5957,7 @@ int wiphy_register(struct wiphy *wiphy); * @wiphy: the wiphy to check the locking on * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the + * Return: the value of the specified RCU-protected pointer, but omit the * READ_ONCE(), because caller holds the wiphy mutex used for updates. */ #define wiphy_dereference(wiphy, p) \ diff --git a/include/net/devlink.h b/include/net/devlink.h index fbb9a2668e24..58e33959c852 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1261,6 +1261,18 @@ enum devlink_trap_group_generic_id { .min_burst = _min_burst, \ } +#define devlink_fmsg_put(fmsg, name, value) ( \ + _Generic((value), \ + bool : devlink_fmsg_bool_pair_put, \ + u8 : devlink_fmsg_u8_pair_put, \ + u16 : devlink_fmsg_u32_pair_put, \ + u32 : devlink_fmsg_u32_pair_put, \ + u64 : devlink_fmsg_u64_pair_put, \ + int : devlink_fmsg_u32_pair_put, \ + char * : devlink_fmsg_string_pair_put, \ + const char * : devlink_fmsg_string_pair_put) \ + (fmsg, name, (value))) + enum { /* device supports reload operations */ DEVLINK_F_RELOAD = 1UL << 0, @@ -1994,6 +2006,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb); #else diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 6c5a1ea209a2..ead4170a1d0a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -58,6 +58,12 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(QDISC_OVERLIMIT) \ + FN(QDISC_CONGESTED) \ + FN(CAKE_FLOOD) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ FN(CPU_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ @@ -312,6 +318,36 @@ enum skb_drop_reason { */ SKB_DROP_REASON_QDISC_DROP, /** + * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc + * instance exceeds its total buffer size limit. + */ + SKB_DROP_REASON_QDISC_OVERLIMIT, + /** + * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm + * due to congestion. + */ + SKB_DROP_REASON_QDISC_CONGESTED, + /** + * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of + * CAKE qdisc AQM algorithm (BLUE). 
+ */ + SKB_DROP_REASON_CAKE_FLOOD, + /** + * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band + * limit is reached. + */ + SKB_DROP_REASON_FQ_BAND_LIMIT, + /** + * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet + * timestamp is too far in the future. + */ + SKB_DROP_REASON_FQ_HORIZON_LIMIT, + /** + * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow + * exceeds its limits. + */ + SKB_DROP_REASON_FQ_FLOW_LIMIT, + /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. This can be caused by backlog queue full (see * netdev_max_backlog in net.rst) or RPS flow limit diff --git a/include/net/dsa.h b/include/net/dsa.h index 72ae65e7246a..4aeedb296d67 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -988,6 +988,7 @@ struct dsa_switch_ops { /* * Port's MAC EEE settings */ + bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, @@ -1383,5 +1384,6 @@ static inline bool dsa_user_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif diff --git a/include/net/dst.h b/include/net/dst.h index 0f303cc60252..78c78cdce0e9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -307,7 +307,7 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. - * Returns true if dst is refcounted. + * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { @@ -440,6 +440,15 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) dst->expires = expires; } +static inline unsigned int dst_dev_overhead(struct dst_entry *dst, + struct sk_buff *skb) +{ + if (likely(dst)) + return LL_RESERVED_SPACE(dst->dev); + + return skb->mac_len; +} + INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d096cc6352de..a03d56765832 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -354,7 +354,7 @@ __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * - * Returns pointer to new genetlink header. + * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) @@ -366,7 +366,7 @@ genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * - * Returns pointer to netlink header. + * Returns: pointer to netlink header. 
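 *
 * Example (a sketch; my_family and MY_CMD are placeholders): recover the
 * enclosing netlink header, e.g. for dump consistency checking:
 *
 *	void *hdr = genlmsg_put(skb, portid, seq, &my_family, 0, MY_CMD);
 *
 *	nl_dump_check_consistent(cb, genlmsg_nlhdr(hdr));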
 */
 static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
 {
@@ -435,7 +435,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb,
  * @flags: netlink message flags
  * @cmd: generic netlink command
  *
- * Returns pointer to user specific header
+ * Returns: pointer to user specific header
  */
 static inline void *genlmsg_put_reply(struct sk_buff *skb,
 				      struct genl_info *info,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 56d8bc5593d3..3ccbad881d74 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -172,7 +172,7 @@ struct inet_cork {
 	u8			tx_flags;
 	__u8			ttl;
 	__s16			tos;
-	char			priority;
+	u32			priority;
 	__u16			gso_size;
 	u32			ts_opt_id;
 	u64			transmit_time;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 62c0a7e65d6b..67a313575780 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -74,6 +74,10 @@ struct inet_timewait_sock {
 			tw_tos		: 8;
 	u32		tw_txhash;
 	u32		tw_priority;
+	/**
+	 * @tw_entry_stamp: Time of entry into %TCP_TIME_WAIT state in msec.
+	 */
+	u32		tw_entry_stamp;
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 	struct inet_bind2_bucket	*tw_tb2;
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 74ff688568a0..f475757daafb 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -96,30 +96,28 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
 
 /* can be called with or without local BH being disabled */
 struct inet_peer *inet_getpeer(struct inet_peer_base *base,
-			       const struct inetpeer_addr *daddr,
-			       int create);
+			       const struct inetpeer_addr *daddr);
 
 static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
 						__be32 v4daddr,
-						int vif, int create)
+						int vif)
 {
 	struct inetpeer_addr daddr;
 
 	daddr.a4.addr = v4daddr;
 	daddr.a4.vif = vif;
 	daddr.family = AF_INET;
-	return inet_getpeer(base, &daddr, create);
+	return inet_getpeer(base, &daddr);
 }
 
 static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
-						const struct in6_addr *v6daddr,
-						int create)
+						const struct in6_addr *v6daddr)
 {
 	struct inetpeer_addr daddr;
 
 	daddr.a6 = *v6daddr;
 	daddr.family = AF_INET6;
-	return inet_getpeer(base, &daddr, create);
+	return inet_getpeer(base, &daddr);
 }
 
 static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
diff --git a/include/net/ip.h b/include/net/ip.h
index 0e548c1f2a0e..9f5e33e371fc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -81,7 +81,6 @@ struct ipcm_cookie {
 	__u8			protocol;
 	__u8			ttl;
 	__s16			tos;
-	char			priority;
 	__u16			gso_size;
 };
 
@@ -96,6 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
 	ipcm_init(ipcm);
 
 	ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
+	ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority);
 	ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
 	ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
 	ipcm->addr = inet->inet_saddr;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 248bfb26e2af..f5c43ad1565e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -471,7 +471,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 /* This helper is specialized for BIG TCP needs.
  * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header.
  * It assumes headers are already in skb->head.
- * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there.
+ * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there.
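 *
 * Example (a sketch; my_handle_big_tcp() is a placeholder):
 *
 *	if (ipv6_has_hopopt_jumbo(skb) == IPPROTO_TCP)
 *		my_handle_big_tcp(skb);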
*/ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index dd9e93c12260..9804fa5d9c67 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -202,7 +202,7 @@ struct iucv_handler { * * Registers a driver with IUCV. * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp); @@ -224,7 +224,7 @@ void iucv_unregister(struct iucv_handler *handle, int smp); * * Allocate a new path structure for use with iucv_connect. * - * Returns NULL if the memory allocation failed or a pointer to the + * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) @@ -260,7 +260,7 @@ static inline void iucv_path_free(struct iucv_path *path) * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); @@ -278,7 +278,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -292,7 +292,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); @@ -304,7 +304,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata); @@ -315,7 +315,7 @@ int iucv_path_resume(struct iucv_path *path, u8 *userdata); * * This function terminates an IUCV path. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); @@ -327,7 +327,7 @@ int iucv_path_sever(struct iucv_path *path, u8 *userdata); * * Cancels a message you have sent. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); @@ -347,7 +347,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. 
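 *
 * Example (a sketch; buf and residual are caller-provided):
 *
 *	rc = iucv_message_receive(path, msg, 0, buf, sizeof(buf), &residual);
 *	if (rc)
 *		pr_err("iucv receive failed: %d\n", rc);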
*/ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); @@ -367,7 +367,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, @@ -382,7 +382,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); @@ -399,7 +399,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); @@ -419,7 +419,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -439,7 +439,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -461,7 +461,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 5adf6fda11e8..06985530517b 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -49,7 +49,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); * * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * @@ -108,7 +108,7 @@ nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, * * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. 
*/ struct sock * diff --git a/include/net/netlink.h b/include/net/netlink.h index 39eaa6be6ca8..e015ffbed819 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -649,7 +649,7 @@ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * - * Returns the next netlink message in the message stream and + * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * @@ -676,7 +676,7 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -701,7 +701,7 @@ static inline int nla_parse(struct nlattr **tb, int maxtype, * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -726,7 +726,7 @@ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, @@ -833,7 +833,7 @@ nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) @@ -854,7 +854,7 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, @@ -877,7 +877,7 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, @@ -914,7 +914,7 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, * nlmsg_report - need to report back to application? * @nlh: netlink message header * - * Returns 1 if a report back to the application is requested. + * Returns: 1 if a report back to the application is requested. 
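 *
 * Typically consulted when deciding whether to echo a change back to
 * userspace (a sketch; my_notify() is a placeholder):
 *
 *	if (nlmsg_report(nlh))
 *		my_notify(net, skb, nlh);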
*/ static inline int nlmsg_report(const struct nlmsghdr *nlh) { @@ -925,7 +925,7 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * - * Returns 0 if netlink message is NULL + * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { @@ -952,7 +952,7 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -971,7 +971,7 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) @@ -993,7 +993,7 @@ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, @@ -1050,7 +1050,7 @@ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * - * Returns a pointer to the current tail of the message. + * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { @@ -1276,7 +1276,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * - * Returns the next netlink attribute in the attribute stream and + * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) @@ -1292,7 +1292,7 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) @@ -2091,7 +2091,7 @@ static inline int nla_get_flag(const struct nlattr *nla) * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * - * Returns the number of milliseconds in jiffies. + * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { @@ -2183,7 +2183,7 @@ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. 
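 *
 * Example with the preferred nla_nest_start() (a sketch; the MY_ATTR_*
 * names are illustrative):
 *
 *	struct nlattr *nest = nla_nest_start(skb, MY_ATTR_STATS);
 *
 *	if (!nest)
 *		return -EMSGSIZE;
 *	if (nla_put_u32(skb, MY_ATTR_RX_PACKETS, rx_packets)) {
 *		nla_nest_cancel(skb, nest);
 *		return -EMSGSIZE;
 *	}
 *	nla_nest_end(skb, nest);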
* - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) @@ -2204,7 +2204,7 @@ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { @@ -2219,7 +2219,7 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) * Corrects the container attribute header to include the all * appended attributes. * - * Returns the total data length of the skb. + * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { @@ -2252,7 +2252,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, @@ -2285,7 +2285,7 @@ nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * - * Return true if padding is needed to align the next attribute (nla_data()) to + * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) @@ -2312,7 +2312,7 @@ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * - * Returns zero on success or a negative error code. + * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { diff --git a/include/net/netmem.h b/include/net/netmem.h index 8a6e20be4b9d..1b58faa4f20f 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -72,6 +72,22 @@ static inline bool netmem_is_net_iov(const netmem_ref netmem) return (__force unsigned long)netmem & NET_IOV; } +/** + * __netmem_to_page - unsafely get pointer to the &page backing @netmem + * @netmem: netmem reference to convert + * + * Unsafe version of netmem_to_page(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB, no WARN). When @netmem points to IOV, + * provokes undefined behaviour. + * + * Return: pointer to the &page (garbage if @netmem is not page-backed). + */ +static inline struct page *__netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + /* This conversion fails (returns NULL) if the netmem_ref is not struct page * backed. 
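 *
 * Callers that cannot rule out net_iov backing must check the result
 * (a sketch):
 *
 *	struct page *page = netmem_to_page(netmem);
 *
 *	if (!page)
 *		return -EINVAL;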
*/ @@ -80,7 +96,7 @@ static inline struct page *netmem_to_page(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return NULL; - return (__force struct page *)netmem; + return __netmem_to_page(netmem); } static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) @@ -103,6 +119,17 @@ static inline netmem_ref page_to_netmem(struct page *page) return (__force netmem_ref)page; } +/** + * virt_to_netmem - convert virtual memory pointer to a netmem reference + * @data: host memory pointer to convert + * + * Return: netmem reference to the &page backing this virtual address. + */ +static inline netmem_ref virt_to_netmem(const void *data) +{ + return page_to_netmem(virt_to_page(data)); +} + static inline int netmem_ref_count(netmem_ref netmem) { /* The non-pp refcount of net_iov is always 1. On net_iov, we only @@ -127,6 +154,22 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } +/** + * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem + * @netmem: netmem reference to get the pointer from + * + * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (avoids clearing the LSB). When @netmem points to IOV, + * provokes invalid memory access. + * + * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). + */ +static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) +{ + return __netmem_to_page(netmem)->pp; +} + static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { return __netmem_clear_lsb(netmem)->pp; @@ -158,12 +201,43 @@ static inline netmem_ref netmem_compound_head(netmem_ref netmem) return page_to_netmem(compound_head(netmem_to_page(netmem))); } +/** + * __netmem_address - unsafely get pointer to the memory backing @netmem + * @netmem: netmem reference to get the pointer for + * + * Unsafe version of netmem_address(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the memory (garbage if @netmem is not page-backed). + */ +static inline void *__netmem_address(netmem_ref netmem) +{ + return page_address(__netmem_to_page(netmem)); +} + static inline void *netmem_address(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return NULL; - return page_address(netmem_to_page(netmem)); + return __netmem_address(netmem); +} + +/** + * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure + * @netmem: netmem reference to check + * + * Return: true if @netmem is page-backed and the page was allocated under + * memory pressure, false otherwise. 
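The pattern the new ``__``-prefixed netmem accessors enable: when a driver knows a reference is page-backed (for example a header buffer it allocated itself), it may skip the LSB check; for anything that might be a net_iov it must keep using the checked variants, which return NULL. A hypothetical RX helper under that assumption (``my_rx_parse()`` is invented; the accessors are the ones added above):

.. code-block:: c

    #include <net/netmem.h>

    /*
     * Hypothetical: hdr_netmem is guaranteed page-backed (a driver-owned
     * header buffer); frag_netmem may be unreadable device memory (net_iov).
     */
    static int my_rx_parse(netmem_ref hdr_netmem, netmem_ref frag_netmem)
    {
            void *hdr = __netmem_address(hdr_netmem);  /* no LSB check */
            void *frag;

            frag = netmem_address(frag_netmem);        /* checked variant */
            if (!frag)
                    return -EIO;  /* net_iov: payload not CPU-readable */

            /* ... parse headers at hdr, payload at frag ... */
            return 0;
    }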
+ */ +static inline bool netmem_is_pfmemalloc(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return false; + + return page_is_pfmemalloc(netmem_to_page(netmem)); } static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3c014170e001..46452da35206 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -175,6 +175,7 @@ struct netns_ipv4 { u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; + unsigned int sysctl_tcp_tw_reuse_delay; int sysctl_tcp_fin_timeout; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 793e6fd78bc5..776a3008ac28 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -104,8 +104,7 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) * * Get a page fragment from the page allocator or page_pool caches. * - * Return: - * Return allocated page fragment, otherwise return NULL. + * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, @@ -116,22 +115,22 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -static inline struct page *page_pool_alloc(struct page_pool *pool, - unsigned int *offset, - unsigned int *size, gfp_t gfp) +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page; + netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; - return page_pool_alloc_pages(pool, gfp); + return page_pool_alloc_netmems(pool, gfp); } - page = page_pool_alloc_frag(pool, offset, *size, gfp); - if (unlikely(!page)) - return NULL; + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize @@ -142,7 +141,14 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, pool->frag_offset = max_size; } - return page; + return netmem; +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** @@ -155,8 +161,7 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * - * Return: - * Return allocated page or page fragment, otherwise return NULL. + * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, @@ -190,8 +195,7 @@ static inline void *page_pool_alloc_va(struct page_pool *pool, * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * - * Return: - * Return the va for the allocated page or page fragment, otherwise return NULL. + * Return: the va for the allocated page or page fragment, otherwise return NULL. 
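Note the sizing rule in the reworked page_pool_alloc_netmem() above: if doubling the requested size would exceed the pool's page size, no second fragment could ever share the page, so the helper hands out the whole page (and reports the full size) to keep truesize accounting honest. A hypothetical refill sketch (``struct my_rx_slot`` is invented; the allocator call is the helper shown above):

.. code-block:: c

    #include <net/page_pool/helpers.h>

    /* Invented per-descriptor bookkeeping, for illustration. */
    struct my_rx_slot {
            netmem_ref netmem;
            unsigned int offset;
            unsigned int len;
    };

    static int my_refill(struct page_pool *pool, struct my_rx_slot *slot,
                         unsigned int len)
    {
            slot->len = len;

            /*
             * May round slot->len up to the full page when a second
             * fragment would not fit, keeping truesize exact.
             */
            slot->netmem = page_pool_alloc_netmem(pool, &slot->offset,
                                                  &slot->len, GFP_ATOMIC);
            return slot->netmem ? 0 : -ENOMEM;
    }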
*/ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) @@ -302,7 +306,7 @@ static inline void page_pool_ref_page(struct page *page) page_pool_ref_netmem(page_to_netmem(page)); } -static inline bool page_pool_is_last_ref(netmem_ref netmem) +static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; @@ -317,7 +321,7 @@ static inline void page_pool_put_netmem(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_ref(netmem)) + if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); @@ -418,7 +422,21 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); } /** @@ -437,10 +455,21 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { - dma_sync_single_range_for_cpu(pool->p.dev, - page_pool_get_dma_addr(page), - offset + pool->p.offset, dma_sync_size, - page_pool_get_dma_dir(pool)); + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe3..ed4cd114180a 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -164,7 +164,8 @@ struct page_pool { bool has_init_callback:1; /* slow::init_callback is set */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif @@ -242,7 +243,7 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, @@ -259,8 +260,7 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); +void 
page_pool_put_netmem_bulk(netmem_ref *data, u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +272,7 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static inline void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { } #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index cf199af85c52..22c5ab4269d7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -319,7 +319,7 @@ tcf_exts_hw_stats_update(const struct tcf_exts *exts, * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * - * Returns true if at least one action is present. + * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { @@ -501,7 +501,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, * through all ematches respecting their logic relations returning * as soon as the result is obvious. * - * Returns 1 if the ematch tree as-one matches, no ematches are configured + * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5d74fa7e694c..8074322dd636 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1245,6 +1245,14 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free, + enum skb_drop_reason reason) +{ + tcf_set_drop_reason(skb, reason); + return qdisc_drop(skb, sch, to_free); +} + static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..d4bdd3286e03 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -953,6 +953,7 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1814,13 +1815,15 @@ struct sockcm_cookie { u32 mark; u32 tsflags; u32 ts_opt_id; + u32 priority; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { - .tsflags = READ_ONCE(sk->sk_tsflags) + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), }; } @@ -2658,7 +2661,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK)) + (1UL << SOCK_RCVMARK) |\ + (1UL << SOCK_RCVPRIORITY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) diff --git a/include/net/tcp.h b/include/net/tcp.h index e9b37b76e894..5b2b04835688 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1817,7 +1817,7 @@ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * - * Returns 0 on 
success, error otherwise. + * Returns: 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** diff --git a/include/net/tls.h b/include/net/tls.h index 61fef2880114..857340338b69 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -59,6 +59,8 @@ struct tls_rec; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) +#define TLS_HANDSHAKE_KEYUPDATE 24 /* rfc8446 B.3: Key update */ + #define TLS_AAD_SPACE_SIZE 13 #define TLS_MAX_IV_SIZE 16 @@ -130,6 +132,7 @@ struct tls_sw_context_rx { u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; + bool key_update_pending; struct tls_strparser strp; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 33ba6fc151cf..2dd23ee2bacd 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,7 @@ struct vxlan_config { unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; }; enum { diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c91..d2089cfecefd 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -11,6 +11,8 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ +#include <net/page_pool/types.h> + /** * DOC: XDP RX-queue information * @@ -88,7 +90,7 @@ struct xdp_buff { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } @@ -103,7 +105,8 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline bool +xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -144,15 +147,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } -static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; @@ -165,24 +169,25 @@ out: struct xdp_frame { void *data; - u16 len; - u16 headroom; + u32 len; + u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. 
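A small but useful theme in the xdp.h hunk above: the read-only accessors (xdp_buff_has_frags(), xdp_get_buff_len(), xdp_get_shared_info_from_buff(), ...) now take ``const`` pointers, so inspection-only code can be const-correct end to end. A hypothetical example (``struct my_rx_stats`` is invented):

.. code-block:: c

    #include <net/xdp.h>

    /* Invented counters, for illustration. */
    struct my_rx_stats {
            u64 packets;
            u64 bytes;
            u64 frag_packets;
    };

    static void my_count_rx(struct my_rx_stats *stats,
                            const struct xdp_buff *xdp)
    {
            stats->packets++;
            stats->bytes += xdp_get_buff_len(xdp);  /* now takes const */
            if (xdp_buff_has_frags(xdp))            /* now takes const */
                    stats->frag_packets++;
    }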
*/ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +static __always_inline bool +xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -190,18 +195,16 @@ static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; - void *xa; - void *q[XDP_BULK_QUEUE_SIZE]; + netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { - /* bq->count will be zero'ed when bq->xa gets updated */ - bq->xa = NULL; + bq->count = 0; } static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -249,7 +252,8 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; @@ -260,7 +264,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -302,24 +306,33 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp); +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { - struct skb_shared_info *sinfo; + if (unlikely(!bq->count)) + return; + + page_pool_put_netmem_bulk(bq->q, bq->count); + bq->count = 0; +} + +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) +{ + const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) @@ -351,6 +364,38 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void 
xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If the driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If the driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd9160..f3175a5d28f7 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -101,7 +101,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } @@ -143,7 +143,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) list_add_tail(&frag->list_node, &frag->pool->xskb_list); } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; @@ -200,7 +200,8 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) XDP_TXMD_FLAGS_CHECKSUM | \ 0) -static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } @@ -337,7 +338,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } @@ -360,7 +361,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bb03cee716b3..50779406bc2d 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; -}; +} __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) @@ -183,7 +183,7 @@ static inline bool 
xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } -static inline bool xp_mb_desc(struct xdp_desc *desc) +static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 5d7ee2610728..8d22b2e98d48 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -22,6 +22,7 @@ TRACE_EVENT(fib6_table_lookup, __field( int, err ) __field( int, oif ) __field( int, iif ) + __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) @@ -42,6 +43,7 @@ TRACE_EVENT(fib6_table_lookup, __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; + __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; @@ -76,11 +78,11 @@ TRACE_EVENT(fib6_table_lookup, } ), - TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", + TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, - __entry->tos, __entry->scope, __entry->flags, - __entry->name, __entry->gw, __entry->err) + __entry->flowlabel, __entry->tos, __entry->scope, + __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d03e0bd8c028..2f119d18a061 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -117,8 +117,10 @@ #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ + EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ + EM(rxrpc_call_poke_rx_packet, "Rx-packet") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ EM(rxrpc_call_poke_start, "Start") \ EM(rxrpc_call_poke_timer, "Timer") \ @@ -127,9 +129,9 @@ #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ + EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ - EM(rxrpc_skb_get_last_nack, "GET last-nack") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ @@ -138,12 +140,12 @@ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ + EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ - EM(rxrpc_skb_put_last_nack, "PUT last-nack") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ @@ -282,6 +284,7 @@ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE 
connect ") \ + EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ @@ -292,7 +295,6 @@ #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ - EM(rxrpc_txqueue_dequeue, "DEQ") \ EM(rxrpc_txqueue_end, "END") \ EM(rxrpc_txqueue_queue, "QUE") \ EM(rxrpc_txqueue_queue_last, "QLS") \ @@ -300,6 +302,13 @@ EM(rxrpc_txqueue_rotate_last, "RLS") \ E_(rxrpc_txqueue_wait, "WAI") +#define rxrpc_txdata_traces \ + EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ + EM(rxrpc_txdata_new_data, " ") \ + EM(rxrpc_txdata_retransmit, " *RETRANS*") \ + EM(rxrpc_txdata_tlp_new_data, " *TLP-NEW*") \ + E_(rxrpc_txdata_tlp_retransmit, " *TLP-RETRANS*") + #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ EM(rxrpc_receive_front, "FRN") \ @@ -335,11 +344,10 @@ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ - EM(rxrpc_rtt_rx_other_ack, "OACK") \ + EM(rxrpc_rtt_rx_data_ack, "DACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ - EM(rxrpc_rtt_rx_ping_response, "PONG") \ - E_(rxrpc_rtt_rx_requested_ack, "RACK") + E_(rxrpc_rtt_rx_ping_response, "PONG") #define rxrpc_timer_traces \ EM(rxrpc_timer_trace_delayed_ack, "DelayAck ") \ @@ -347,11 +355,12 @@ EM(rxrpc_timer_trace_hard, "HardLimit") \ EM(rxrpc_timer_trace_idle, "IdleLimit") \ EM(rxrpc_timer_trace_keepalive, "KeepAlive") \ - EM(rxrpc_timer_trace_lost_ack, "LostAck ") \ EM(rxrpc_timer_trace_ping, "DelayPing") \ - EM(rxrpc_timer_trace_resend, "Resend ") \ - EM(rxrpc_timer_trace_resend_reset, "ResendRst") \ - E_(rxrpc_timer_trace_resend_tx, "ResendTx ") + EM(rxrpc_timer_trace_rack_off, "RACK-OFF ") \ + EM(rxrpc_timer_trace_rack_zwp, "RACK-ZWP ") \ + EM(rxrpc_timer_trace_rack_reo, "RACK-Reo ") \ + EM(rxrpc_timer_trace_rack_tlp_pto, "TLP-PTO ") \ + E_(rxrpc_timer_trace_rack_rto, "RTO ") #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ @@ -362,22 +371,24 @@ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ + EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ EM(rxrpc_propose_ack_processing_op, "ProcOp ") \ EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ + EM(rxrpc_propose_ack_retransmit, "Retrans") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") -#define rxrpc_congest_modes \ - EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ - EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ - EM(RXRPC_CALL_PACKET_LOSS, "PktLoss ") \ - E_(RXRPC_CALL_SLOW_START, "SlowStart") +#define rxrpc_ca_states \ + EM(RXRPC_CA_CONGEST_AVOIDANCE, "CongAvoid") \ + EM(RXRPC_CA_FAST_RETRANSMIT, "FastReTx ") \ + EM(RXRPC_CA_PACKET_LOSS, "PktLoss ") \ + E_(RXRPC_CA_SLOW_START, "SlowStart") #define rxrpc_congest_changes \ EM(rxrpc_cong_begin_retransmission, " Retrans") \ @@ -450,7 +461,7 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ - EM(rxrpc_reqack_already_on, "ALREADY-ON") \ + EM(rxrpc_reqack_app_stall, "APP-STALL ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ 
EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ @@ -460,21 +471,60 @@ /* ---- Must update size of stat_why_req_ack[] if more are added! */ #define rxrpc_txbuf_traces \ - EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ - EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ + EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ - EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ - E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") + E_(rxrpc_txbuf_see_send_more, "SEE SEND+ ") + +#define rxrpc_tq_traces \ + EM(rxrpc_tq_alloc, "ALLOC") \ + EM(rxrpc_tq_cleaned, "CLEAN") \ + EM(rxrpc_tq_decant, "DCNT ") \ + EM(rxrpc_tq_decant_advance, "DCNT>") \ + EM(rxrpc_tq_queue, "QUEUE") \ + EM(rxrpc_tq_queue_dup, "QUE!!") \ + EM(rxrpc_tq_rotate, "ROT ") \ + EM(rxrpc_tq_rotate_and_free, "ROT-F") \ + EM(rxrpc_tq_rotate_and_keep, "ROT-K") \ + EM(rxrpc_tq_transmit, "XMIT ") \ + E_(rxrpc_tq_transmit_advance, "XMIT>") + +#define rxrpc_pmtud_reduce_traces \ + EM(rxrpc_pmtud_reduce_ack, "Ack ") \ + EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ + E_(rxrpc_pmtud_reduce_route, "Route") + +#define rxrpc_rotate_traces \ + EM(rxrpc_rotate_trace_hack, "hard-ack") \ + EM(rxrpc_rotate_trace_sack, "soft-ack") \ + E_(rxrpc_rotate_trace_snak, "soft-nack") + +#define rxrpc_rack_timer_modes \ + EM(RXRPC_CALL_RACKTIMER_OFF, "---") \ + EM(RXRPC_CALL_RACKTIMER_RACK_REORDER, "REO") \ + EM(RXRPC_CALL_RACKTIMER_TLP_PTO, "TLP") \ + E_(RXRPC_CALL_RACKTIMER_RTO, "RTO") + +#define rxrpc_tlp_probe_traces \ + EM(rxrpc_tlp_probe_trace_busy, "busy") \ + EM(rxrpc_tlp_probe_trace_transmit_new, "transmit-new") \ + E_(rxrpc_tlp_probe_trace_retransmit, "retransmit") + +#define rxrpc_tlp_ack_traces \ + EM(rxrpc_tlp_ack_trace_acked, "acked") \ + EM(rxrpc_tlp_ack_trace_dup_acked, "dup-acked") \ + EM(rxrpc_tlp_ack_trace_hard_beyond, "hard-beyond") \ + EM(rxrpc_tlp_ack_trace_incomplete, "incomplete") \ + E_(rxrpc_tlp_ack_trace_new_data, "new-data") /* * Generate enums for tracing information. 
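For readers new to this header's idiom: each table above is written once as a list of EM()/E_() pairs and then expanded several times with different definitions of EM/E_, once into C enums, once into TRACE_DEFINE_ENUM() calls, and once into the ``{ value, "string" }`` pairs that __print_symbolic() consumes in TP_printk(). A reduced sketch of the shape, not the kernel's exact plumbing (``my_thing_traces`` is invented; ``__mode(byte)`` is this header's shorthand for ``__attribute__((mode(byte)))``):

.. code-block:: c

    /* One table, several expansions. */
    #define my_thing_traces \
            EM(my_thing_alloc, "ALLOC") \
            E_(my_thing_free,  "FREE ")

    /* First expansion: generate the enum values. */
    #define EM(a, b) a,
    #define E_(a, b) a
    enum my_thing_trace { my_thing_traces } __mode(byte);
    #undef EM
    #undef E_

    /*
     * Second expansion: generate the { value, "string" } pairs used by
     * __print_symbolic() in TP_printk().
     */
    #define EM(a, b) { a, b },
    #define E_(a, b) { a, b }
    /* ... __print_symbolic(__entry->trace, my_thing_traces) ... */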
@@ -496,18 +546,24 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); +enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); +enum rxrpc_rotate_trace { rxrpc_rotate_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); +enum rxrpc_tlp_ack_trace { rxrpc_tlp_ack_traces } __mode(byte); +enum rxrpc_tlp_probe_trace { rxrpc_tlp_probe_traces } __mode(byte); +enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); +enum rxrpc_txdata_trace { rxrpc_txdata_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -525,24 +581,31 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); rxrpc_abort_reasons; rxrpc_bundle_traces; +rxrpc_ca_states; rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; -rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; +rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; +rxrpc_rack_timer_modes; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; +rxrpc_rotate_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; +rxrpc_tlp_ack_traces; +rxrpc_tlp_probe_traces; +rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; +rxrpc_txdata_traces; rxrpc_txqueue_traces; /* @@ -581,6 +644,20 @@ TRACE_EVENT(rxrpc_local, __entry->usage) ); +TRACE_EVENT(rxrpc_iothread_rx, + TP_PROTO(struct rxrpc_local *local, unsigned int nr_rx), + TP_ARGS(local, nr_rx), + TP_STRUCT__entry( + __field(unsigned int, local) + __field(unsigned int, nr_rx) + ), + TP_fast_assign( + __entry->local = local->debug_id; + __entry->nr_rx = nr_rx; + ), + TP_printk("L=%08x nrx=%u", __entry->local, __entry->nr_rx) + ); + TRACE_EVENT(rxrpc_peer, TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), @@ -865,34 +942,101 @@ TRACE_EVENT(rxrpc_txqueue, TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) - __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) + __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_top) - __field(rxrpc_seq_t, tx_prepared) + __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) ), TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; - __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; + __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_top = call->tx_top; - __entry->tx_prepared = call->tx_prepared; + __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", + TP_printk("c=%08x 
%s b=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, __entry->acks_hard_ack, - __entry->tx_top - __entry->tx_bottom, + __entry->acks_hard_ack - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, - __entry->tx_prepared - __entry->tx_bottom, + __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t send_top, int space), + + TP_ARGS(call, send_top, space), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(u16, space) + __field(u16, tx_winsize) + __field(u16, cong_cwnd) + __field(u16, cong_extra) + __field(u16, in_flight) + __field(u16, prepared) + __field(u16, pmtud_jumbo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = call->tx_top + 1; + __entry->space = space; + __entry->tx_winsize = call->tx_winsize; + __entry->cong_cwnd = call->cong_cwnd; + __entry->cong_extra = call->cong_extra; + __entry->prepared = send_top - call->tx_bottom; + __entry->in_flight = call->tx_top - call->tx_bottom; + __entry->pmtud_jumbo = call->peer->pmtud_jumbo; + ), + + TP_printk("c=%08x q=%08x sp=%u tw=%u cw=%u+%u pr=%u if=%u pj=%u", + __entry->call, + __entry->seq, + __entry->space, + __entry->tx_winsize, + __entry->cong_cwnd, + __entry->cong_extra, + __entry->prepared, + __entry->in_flight, + __entry->pmtud_jumbo) + ); + +TRACE_EVENT(rxrpc_tx_rotate, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_seq_t to), + + TP_ARGS(call, seq, to), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(rxrpc_seq_t, to) + __field(rxrpc_seq_t, top) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = seq; + __entry->to = to; + __entry->top = call->tx_top; + ), + + TP_printk("c=%08x q=%08x-%08x-%08x", + __entry->call, + __entry->seq, + __entry->to, + __entry->top) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), @@ -921,11 +1065,9 @@ TRACE_EVENT(rxrpc_rx_data, ); TRACE_EVENT(rxrpc_rx_ack, - TP_PROTO(struct rxrpc_call *call, - rxrpc_serial_t serial, rxrpc_serial_t ack_serial, - rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_skb_priv *sp), - TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks), + TP_ARGS(call, sp), TP_STRUCT__entry( __field(unsigned int, call) @@ -935,23 +1077,26 @@ TRACE_EVENT(rxrpc_rx_ack, __field(rxrpc_seq_t, prev) __field(u8, reason) __field(u8, n_acks) + __field(u8, user_status) ), TP_fast_assign( - __entry->call = call->debug_id; - __entry->serial = serial; - __entry->ack_serial = ack_serial; - __entry->first = first; - __entry->prev = prev; - __entry->reason = reason; - __entry->n_acks = n_acks; + __entry->call = call->debug_id; + __entry->serial = sp->hdr.serial; + __entry->user_status = sp->hdr.userStatus; + __entry->ack_serial = sp->ack.acked_serial; + __entry->first = sp->ack.first_ack; + __entry->prev = sp->ack.prev_ack; + __entry->reason = sp->ack.reason; + __entry->n_acks = sp->ack.nr_acks; ), - TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", + TP_printk("c=%08x %08x %s r=%08x us=%02x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_serial, + __entry->user_status, __entry->first, __entry->prev, __entry->n_acks) @@ -981,6 +1126,29 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); 
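Every TRACE_EVENT() in this file follows the same five-part shape. For orientation, here is a reduced, hypothetical event (``my_event`` and its fields are invented, and the TRACE_SYSTEM/include boilerplate a real trace header needs is omitted):

.. code-block:: c

    #include <linux/tracepoint.h>

    TRACE_EVENT(my_event,
                TP_PROTO(unsigned int id, u32 code),  /* C prototype */
                TP_ARGS(id, code),                    /* argument names */

                TP_STRUCT__entry(                     /* ring-buffer record */
                        __field(unsigned int, id)
                        __field(u32, code)
                        ),

                TP_fast_assign(                       /* capture at trace time */
                        __entry->id = id;
                        __entry->code = code;
                        ),

                TP_printk("id=%08x code=%d",          /* decode at read time */
                          __entry->id, __entry->code)
                );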
+TRACE_EVENT(rxrpc_rx_conn_abort, + TP_PROTO(const struct rxrpc_connection *conn, const struct sk_buff *skb), + + TP_ARGS(conn, skb), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, abort_code) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = rxrpc_skb(skb)->hdr.serial; + __entry->abort_code = skb->priority; + ), + + TP_printk("C=%08x ABORT %08x ac=%d", + __entry->conn, + __entry->serial, + __entry->abort_code) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), @@ -1102,9 +1270,10 @@ TRACE_EVENT(rxrpc_tx_packet, TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, unsigned int flags, bool lose), + rxrpc_serial_t serial, unsigned int flags, + enum rxrpc_txdata_trace trace), - TP_ARGS(call, seq, serial, flags, lose), + TP_ARGS(call, seq, serial, flags, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1113,7 +1282,7 @@ TRACE_EVENT(rxrpc_tx_data, __field(u32, cid) __field(u32, call_id) __field(u16, flags) - __field(bool, lose) + __field(enum rxrpc_txdata_trace, trace) ), TP_fast_assign( @@ -1123,26 +1292,26 @@ TRACE_EVENT(rxrpc_tx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; - __entry->lose = lose; + __entry->trace = trace; ), - TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s", + TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s", __entry->call, __entry->cid, __entry->call_id, __entry->serial, __entry->seq, __entry->flags & RXRPC_TXBUF_WIRE_FLAGS, - __entry->flags & RXRPC_TXBUF_RESENT ? " *RETRANS*" : "", - __entry->lose ? " *LOSE*" : "") + __print_symbolic(__entry->trace, rxrpc_txdata_traces)) ); TRACE_EVENT(rxrpc_tx_ack, TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, - u8 reason, u8 n_acks, u16 rwind), + u8 reason, u8 n_acks, u16 rwind, + enum rxrpc_propose_ack_trace trace), - TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1152,6 +1321,7 @@ TRACE_EVENT(rxrpc_tx_ack, __field(u8, reason) __field(u8, n_acks) __field(u16, rwind) + __field(enum rxrpc_propose_ack_trace, trace) ), TP_fast_assign( @@ -1162,16 +1332,18 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->reason = reason; __entry->n_acks = n_acks; __entry->rwind = rwind; + __entry->trace = trace; ), - TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u", + TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u %s", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_first, __entry->ack_serial, __entry->n_acks, - __entry->rwind) + __entry->rwind, + __print_symbolic(__entry->trace, rxrpc_propose_ack_traces)) ); TRACE_EVENT(rxrpc_receive, @@ -1296,9 +1468,9 @@ TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, - u32 rtt, u32 rto), + u32 rtt, u32 srtt, u32 rto), - TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto), + TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, srtt, rto), TP_STRUCT__entry( __field(unsigned int, call) @@ -1307,7 +1479,9 @@ TRACE_EVENT(rxrpc_rtt_rx, __field(rxrpc_serial_t, send_serial) __field(rxrpc_serial_t, resp_serial) __field(u32, rtt) + __field(u32, srtt) __field(u32, rto) + __field(u32, 
min_rtt) ), TP_fast_assign( @@ -1317,17 +1491,21 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; + __entry->srtt = srtt; __entry->rto = rto; + __entry->min_rtt = minmax_get(&call->min_rtt) ), - TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u", + TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, - __entry->rto) + __entry->srtt / 8, + __entry->rto, + __entry->min_rtt) ); TRACE_EVENT(rxrpc_timer_set, @@ -1544,112 +1722,125 @@ TRACE_EVENT(rxrpc_drop_ack, ); TRACE_EVENT(rxrpc_retransmit, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, ktime_t expiry), + TP_PROTO(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb), - TP_ARGS(call, seq, serial, expiry), + TP_ARGS(call, req, txb), TP_STRUCT__entry( __field(unsigned int, call) + __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) - __field(ktime_t, expiry) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->seq = seq; - __entry->serial = serial; - __entry->expiry = expiry; + __entry->qbase = req->tq->qbase; + __entry->seq = req->seq; + __entry->serial = txb->serial; ), - TP_printk("c=%08x q=%x r=%x xp=%lld", + TP_printk("c=%08x tq=%x q=%x r=%x", __entry->call, + __entry->qbase, __entry->seq, - __entry->serial, - ktime_to_us(__entry->expiry)) + __entry->serial) ); TRACE_EVENT(rxrpc_congest, - TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), - TP_ARGS(call, summary, ack_serial, change), + TP_ARGS(call, summary), TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_change, change) + __field(enum rxrpc_ca_state, ca_state) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) - __field(rxrpc_serial_t, ack_serial) + __field(u16, nr_sacks) + __field(u16, nr_snacks) + __field(u16, cwnd) + __field(u16, ssthresh) + __field(u16, cumul_acks) + __field(u16, dup_acks) __field_struct(struct rxrpc_ack_summary, sum) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->change = change; + __entry->ca_state = call->cong_ca_state; __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; - __entry->ack_serial = ack_serial; + __entry->nr_sacks = call->acks_nr_sacks; + __entry->nr_snacks = call->acks_nr_snacks; + __entry->cwnd = call->cong_cwnd; + __entry->ssthresh = call->cong_ssthresh; + __entry->cumul_acks = call->cong_cumul_acks; + __entry->dup_acks = call->cong_dup_acks; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, - __entry->ack_serial, + __entry->sum.acked_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, - __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), - __entry->sum.cwnd, - __entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.nr_retained_nacks, - __entry->sum.nr_new_acks, - __entry->sum.nr_new_nacks, + __print_symbolic(__entry->ca_state, rxrpc_ca_states), + 
__entry->cwnd, + __entry->ssthresh, + __entry->nr_sacks, __entry->sum.nr_new_sacks, + __entry->nr_snacks, __entry->sum.nr_new_snacks, + __entry->sum.nr_new_hacks, __entry->top - __entry->hard_ack, - __entry->sum.cumulative_acks, - __entry->sum.dup_acks, - __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", - __print_symbolic(__entry->change, rxrpc_congest_changes), + __entry->cumul_acks, + __entry->dup_acks, + __entry->lowest_nak, __entry->sum.new_low_snack ? "!" : "", + __print_symbolic(__entry->sum.change, rxrpc_congest_changes), __entry->sum.retrans_timeo ? " rTxTo" : "") ); TRACE_EVENT(rxrpc_reset_cwnd, - TP_PROTO(struct rxrpc_call *call, ktime_t now), + TP_PROTO(struct rxrpc_call *call, ktime_t since_last_tx, ktime_t rtt), - TP_ARGS(call, now), + TP_ARGS(call, since_last_tx, rtt), TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_mode, mode) + __field(enum rxrpc_ca_state, ca_state) __field(unsigned short, cwnd) __field(unsigned short, extra) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prepared) __field(ktime_t, since_last_tx) + __field(ktime_t, rtt) __field(bool, has_data) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->mode = call->cong_mode; + __entry->ca_state = call->cong_ca_state; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; - __entry->prepared = call->tx_prepared - call->tx_bottom; - __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); - __entry->has_data = !list_empty(&call->tx_sendmsg); + __entry->prepared = call->send_top - call->tx_bottom; + __entry->since_last_tx = since_last_tx; + __entry->rtt = rtt; + __entry->has_data = call->tx_bottom != call->tx_top; ), - TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", + TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu/%llu d=%u", __entry->call, __entry->hard_ack, - __print_symbolic(__entry->mode, rxrpc_congest_modes), + __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, - ktime_to_ns(__entry->since_last_tx), + ktime_to_us(__entry->since_last_tx), + ktime_to_us(__entry->rtt), __entry->has_data) ); @@ -1722,10 +1913,36 @@ TRACE_EVENT(rxrpc_connect_call, &__entry->srx.transport) ); +TRACE_EVENT(rxrpc_apply_acks, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(unsigned int, nr_rep) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, acks) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->acks = tq->segment_acked; + __entry->nr_rep = tq->nr_reported_acks; + ), + + TP_printk("c=%08x tq=%x acks=%016lx rep=%u", + __entry->call, + __entry->qbase, + __entry->acks, + __entry->nr_rep) + ); + TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t ack_serial), - TP_ARGS(call, ack), + TP_ARGS(call, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) @@ -1735,11 +1952,10 @@ TRACE_EVENT(rxrpc_resend, ), TP_fast_assign( - struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; - __entry->ack_serial = sp ? 
sp->hdr.serial : 0; + __entry->ack_serial = ack_serial; ), TP_printk("c=%08x r=%x q=%x tq=%x", @@ -1749,6 +1965,63 @@ TRACE_EVENT(rxrpc_resend, __entry->transmitted) ); +TRACE_EVENT(rxrpc_resend_lost, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, unsigned long lost), + + TP_ARGS(call, tq, lost), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(u8, nr_rep) + __field(unsigned long, lost) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->nr_rep = tq->nr_reported_acks; + __entry->lost = lost; + ), + + TP_printk("c=%08x tq=%x lost=%016lx nr=%u", + __entry->call, + __entry->qbase, + __entry->lost, + __entry->nr_rep) + ); + +TRACE_EVENT(rxrpc_rotate, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, + enum rxrpc_rotate_trace trace), + + TP_ARGS(call, tq, summary, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(unsigned int, nr_rep) + __field(enum rxrpc_rotate_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->seq = seq; + __entry->nr_rep = tq->nr_reported_acks; + __entry->trace = trace; + ), + + TP_printk("c=%08x tq=%x q=%x nr=%x %s", + __entry->call, + __entry->qbase, + __entry->seq, + __entry->nr_rep, + __print_symbolic(__entry->trace, rxrpc_rotate_traces)) + ); + TRACE_EVENT(rxrpc_rx_icmp, TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee, struct sockaddr_rxrpc *srx), @@ -1858,38 +2131,36 @@ TRACE_EVENT(rxrpc_notify_socket, ); TRACE_EVENT(rxrpc_rx_discard_ack, - TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, - rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, - rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt), - TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, - prev_pkt, call_ackr_prev), + TP_ARGS(call, serial, hard_ack, prev_pkt), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) - __field(rxrpc_seq_t, first_soft_ack) - __field(rxrpc_seq_t, call_ackr_first) + __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prev_pkt) - __field(rxrpc_seq_t, call_ackr_prev) + __field(rxrpc_seq_t, acks_hard_ack) + __field(rxrpc_seq_t, acks_prev_seq) ), TP_fast_assign( - __entry->debug_id = debug_id; + __entry->debug_id = call->debug_id; __entry->serial = serial; - __entry->first_soft_ack = first_soft_ack; - __entry->call_ackr_first = call_ackr_first; + __entry->hard_ack = hard_ack; __entry->prev_pkt = prev_pkt; - __entry->call_ackr_prev = call_ackr_prev; + __entry->acks_hard_ack = call->acks_hard_ack; + __entry->acks_prev_seq = call->acks_prev_seq; ), TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", __entry->debug_id, __entry->serial, - __entry->first_soft_ack, - __entry->call_ackr_first, + __entry->hard_ack, + __entry->acks_hard_ack, __entry->prev_pkt, - __entry->call_ackr_prev) + __entry->acks_prev_seq) ); TRACE_EVENT(rxrpc_req_ack, @@ -1947,6 +2218,33 @@ TRACE_EVENT(rxrpc_txbuf, __entry->ref) ); +TRACE_EVENT(rxrpc_tq, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + rxrpc_seq_t seq, enum rxrpc_tq_trace trace), + + TP_ARGS(call, tq, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tq_trace, trace) + 
), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + __entry->qbase = tq ? tq->qbase : call->tx_qbase; + __entry->seq = seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x bq=%08x q=%08x %s", + __entry->call_debug_id, + __entry->qbase, + __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tq_traces)) + ); + TRACE_EVENT(rxrpc_poke_call, TP_PROTO(struct rxrpc_call *call, bool busy, enum rxrpc_call_poke_trace what), @@ -2015,6 +2313,360 @@ TRACE_EVENT(rxrpc_sack, __entry->sack) ); +TRACE_EVENT(rxrpc_pmtud_tx, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(unsigned short, pmtud_trial) + __field(unsigned short, pmtud_good) + __field(unsigned short, pmtud_bad) + ), + + TP_fast_assign( + __entry->peer_debug_id = call->peer->debug_id; + __entry->call_debug_id = call->debug_id; + __entry->ping_serial = call->conn->pmtud_probe; + __entry->pmtud_trial = call->peer->pmtud_trial; + __entry->pmtud_good = call->peer->pmtud_good; + __entry->pmtud_bad = call->peer->pmtud_bad; + ), + + TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->pmtud_good, + __entry->pmtud_trial, + __entry->pmtud_bad) + ); + +TRACE_EVENT(rxrpc_pmtud_rx, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + __field(unsigned short, max_data) + __field(u8, jumbo_max) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + __entry->max_data = conn->peer->max_data; + __entry->jumbo_max = conn->peer->pmtud_jumbo; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial, + __entry->max_data, + __entry->jumbo_max) + ); + +TRACE_EVENT(rxrpc_pmtud_lost, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial) + ); + +TRACE_EVENT(rxrpc_pmtud_reduce, + TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, + unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), + + TP_ARGS(peer, serial, max_data, reason), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(rxrpc_serial_t, serial) + __field(unsigned int, max_data) + __field(enum rxrpc_pmtud_reduce_trace, reason) + ), + + TP_fast_assign( + __entry->peer_debug_id = peer->debug_id; + __entry->serial = serial; + __entry->max_data = max_data; + __entry->reason = reason; + ), + + TP_printk("P=%08x %s r=%08x m=%u", + __entry->peer_debug_id, + __print_symbolic(__entry->reason, 
rxrpc_pmtud_reduce_traces), + __entry->serial, __entry->max_data) + ); + +TRACE_EVENT(rxrpc_rack, + TP_PROTO(struct rxrpc_call *call, ktime_t timo), + + TP_ARGS(call, timo), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_rack_timer_mode, mode) + __field(unsigned short, nr_sent) + __field(unsigned short, nr_lost) + __field(unsigned short, nr_resent) + __field(unsigned short, nr_sacked) + __field(ktime_t, timo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->mode = call->rack_timer_mode; + __entry->nr_sent = call->tx_nr_sent; + __entry->nr_lost = call->tx_nr_lost; + __entry->nr_resent = call->tx_nr_resent; + __entry->nr_sacked = call->acks_nr_sacks; + __entry->timo = timo; + ), + + TP_printk("c=%08x r=%08x q=%08x %s slrs=%u,%u,%u,%u t=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + __entry->nr_sent, __entry->nr_lost, + __entry->nr_resent, __entry->nr_sacked, + ktime_to_us(__entry->timo)) + ); + +TRACE_EVENT(rxrpc_rack_update, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), + + TP_ARGS(call, summary), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(int, xmit_ts) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->xmit_ts = ktime_sub(call->acks_latest_ts, call->rack_xmit_ts); + ), + + TP_printk("c=%08x r=%08x q=%08x xt=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + ktime_to_us(__entry->xmit_ts)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(ktime_t, rack_rtt) + __field(ktime_t, rack_reo_wnd) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->rack_rtt = call->rack_rtt; + __entry->rack_reo_wnd = call->rack_reo_wnd; + ), + + TP_printk("c=%08x rtt=%lld reow=%lld", + __entry->call, ktime_to_us(__entry->rack_rtt), + ktime_to_us(__entry->rack_reo_wnd)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss_tq, + TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq, + unsigned long nacks), + + TP_ARGS(call, tq, nacks), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, nacks) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->nacks = nacks; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x q=%08x n=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, + __entry->nacks, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_rack_detect_loss, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_seq_t seq), + + TP_ARGS(call, summary, seq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = seq; + ), + + TP_printk("c=%08x r=%08x q=%08x", + __entry->call, __entry->ack_serial, __entry->seq) + ); + +TRACE_EVENT(rxrpc_rack_mark_loss_tq, + TP_PROTO(struct rxrpc_call *call, const 
struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, trans) + __field(unsigned long, acked) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->trans = call->tx_transmitted; + __entry->acked = tq->segment_acked; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x tq=%08x txq=%08x a=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, __entry->trans, + __entry->acked, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_tlp_probe, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_tlp_probe_trace trace), + + TP_ARGS(call, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tlp_probe_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->seq = call->tlp_seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x %s", + __entry->call, __entry->serial, __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tlp_probe_traces)) + ); + +TRACE_EVENT(rxrpc_tlp_ack, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + enum rxrpc_tlp_ack_trace trace), + + TP_ARGS(call, summary, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, tlp_seq) + __field(rxrpc_seq_t, hard_ack) + __field(enum rxrpc_tlp_ack_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->tlp_seq = call->tlp_seq; + __entry->hard_ack = call->acks_hard_ack; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x hq=%08x %s", + __entry->call, __entry->serial, + __entry->tlp_seq, __entry->hard_ack, + __print_symbolic(__entry->trace, rxrpc_tlp_ack_traces)) + ); + +TRACE_EVENT(rxrpc_rack_timer, + TP_PROTO(struct rxrpc_call *call, ktime_t delay, bool exp), + + TP_ARGS(call, delay, exp), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(bool, exp) + __field(enum rxrpc_rack_timer_mode, mode) + __field(ktime_t, delay) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->exp = exp; + __entry->mode = call->rack_timer_mode; + __entry->delay = delay; + ), + + TP_printk("c=%08x %s %s to=%lld", + __entry->call, + __entry->exp ? 
"Exp" : "Set", + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + ktime_to_us(__entry->delay)) + ); + #undef EM #undef E_ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index deacfd6dd197..aa5016ff3d91 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -143,6 +143,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 283305f6b063..9c909ce733a5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -10,545 +10,12 @@ #define _UAPI_LINUX_ETHTOOL_NETLINK_H_ #include <linux/ethtool.h> - -/* message types - userspace to kernel */ -enum { - ETHTOOL_MSG_USER_NONE, - ETHTOOL_MSG_STRSET_GET, - ETHTOOL_MSG_LINKINFO_GET, - ETHTOOL_MSG_LINKINFO_SET, - ETHTOOL_MSG_LINKMODES_GET, - ETHTOOL_MSG_LINKMODES_SET, - ETHTOOL_MSG_LINKSTATE_GET, - ETHTOOL_MSG_DEBUG_GET, - ETHTOOL_MSG_DEBUG_SET, - ETHTOOL_MSG_WOL_GET, - ETHTOOL_MSG_WOL_SET, - ETHTOOL_MSG_FEATURES_GET, - ETHTOOL_MSG_FEATURES_SET, - ETHTOOL_MSG_PRIVFLAGS_GET, - ETHTOOL_MSG_PRIVFLAGS_SET, - ETHTOOL_MSG_RINGS_GET, - ETHTOOL_MSG_RINGS_SET, - ETHTOOL_MSG_CHANNELS_GET, - ETHTOOL_MSG_CHANNELS_SET, - ETHTOOL_MSG_COALESCE_GET, - ETHTOOL_MSG_COALESCE_SET, - ETHTOOL_MSG_PAUSE_GET, - ETHTOOL_MSG_PAUSE_SET, - ETHTOOL_MSG_EEE_GET, - ETHTOOL_MSG_EEE_SET, - ETHTOOL_MSG_TSINFO_GET, - ETHTOOL_MSG_CABLE_TEST_ACT, - ETHTOOL_MSG_CABLE_TEST_TDR_ACT, - ETHTOOL_MSG_TUNNEL_INFO_GET, - ETHTOOL_MSG_FEC_GET, - ETHTOOL_MSG_FEC_SET, - ETHTOOL_MSG_MODULE_EEPROM_GET, - ETHTOOL_MSG_STATS_GET, - ETHTOOL_MSG_PHC_VCLOCKS_GET, - ETHTOOL_MSG_MODULE_GET, - ETHTOOL_MSG_MODULE_SET, - ETHTOOL_MSG_PSE_GET, - ETHTOOL_MSG_PSE_SET, - ETHTOOL_MSG_RSS_GET, - ETHTOOL_MSG_PLCA_GET_CFG, - ETHTOOL_MSG_PLCA_SET_CFG, - ETHTOOL_MSG_PLCA_GET_STATUS, - ETHTOOL_MSG_MM_GET, - ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_MODULE_FW_FLASH_ACT, - ETHTOOL_MSG_PHY_GET, - - /* add new constants above here */ - __ETHTOOL_MSG_USER_CNT, - ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 -}; - -/* message types - kernel to userspace */ -enum { - ETHTOOL_MSG_KERNEL_NONE, - ETHTOOL_MSG_STRSET_GET_REPLY, - ETHTOOL_MSG_LINKINFO_GET_REPLY, - ETHTOOL_MSG_LINKINFO_NTF, - ETHTOOL_MSG_LINKMODES_GET_REPLY, - ETHTOOL_MSG_LINKMODES_NTF, - ETHTOOL_MSG_LINKSTATE_GET_REPLY, - ETHTOOL_MSG_DEBUG_GET_REPLY, - ETHTOOL_MSG_DEBUG_NTF, - ETHTOOL_MSG_WOL_GET_REPLY, - ETHTOOL_MSG_WOL_NTF, - ETHTOOL_MSG_FEATURES_GET_REPLY, - ETHTOOL_MSG_FEATURES_SET_REPLY, - ETHTOOL_MSG_FEATURES_NTF, - ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, - ETHTOOL_MSG_PRIVFLAGS_NTF, - ETHTOOL_MSG_RINGS_GET_REPLY, - ETHTOOL_MSG_RINGS_NTF, - ETHTOOL_MSG_CHANNELS_GET_REPLY, - ETHTOOL_MSG_CHANNELS_NTF, - ETHTOOL_MSG_COALESCE_GET_REPLY, - ETHTOOL_MSG_COALESCE_NTF, - ETHTOOL_MSG_PAUSE_GET_REPLY, - ETHTOOL_MSG_PAUSE_NTF, - ETHTOOL_MSG_EEE_GET_REPLY, - ETHTOOL_MSG_EEE_NTF, - ETHTOOL_MSG_TSINFO_GET_REPLY, - ETHTOOL_MSG_CABLE_TEST_NTF, - ETHTOOL_MSG_CABLE_TEST_TDR_NTF, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, - ETHTOOL_MSG_FEC_GET_REPLY, - ETHTOOL_MSG_FEC_NTF, - ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, - ETHTOOL_MSG_STATS_GET_REPLY, - ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, - ETHTOOL_MSG_MODULE_GET_REPLY, - ETHTOOL_MSG_MODULE_NTF, - ETHTOOL_MSG_PSE_GET_REPLY, - ETHTOOL_MSG_RSS_GET_REPLY, - ETHTOOL_MSG_PLCA_GET_CFG_REPLY, - ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, - ETHTOOL_MSG_PLCA_NTF, - 
ETHTOOL_MSG_MM_GET_REPLY, - ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_MODULE_FW_FLASH_NTF, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_MSG_PHY_NTF, - - /* add new constants above here */ - __ETHTOOL_MSG_KERNEL_CNT, - ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 -}; - -/* request header */ - -enum ethtool_header_flags { - ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ - ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ - ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ -}; +#include <linux/ethtool_netlink_generated.h> #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY | \ ETHTOOL_FLAG_STATS) -enum { - ETHTOOL_A_HEADER_UNSPEC, - ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ - ETHTOOL_A_HEADER_DEV_NAME, /* string */ - ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_HEADER_CNT, - ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 -}; - -/* bit sets */ - -enum { - ETHTOOL_A_BITSET_BIT_UNSPEC, - ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ - ETHTOOL_A_BITSET_BIT_NAME, /* string */ - ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BIT_CNT, - ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_BITS_UNSPEC, - ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BITS_CNT, - ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_UNSPEC, - ETHTOOL_A_BITSET_NOMASK, /* flag */ - ETHTOOL_A_BITSET_SIZE, /* u32 */ - ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ - ETHTOOL_A_BITSET_VALUE, /* binary */ - ETHTOOL_A_BITSET_MASK, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_CNT, - ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 -}; - -/* string sets */ - -enum { - ETHTOOL_A_STRING_UNSPEC, - ETHTOOL_A_STRING_INDEX, /* u32 */ - ETHTOOL_A_STRING_VALUE, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_STRING_CNT, - ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGS_UNSPEC, - ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGS_CNT, - ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSET_UNSPEC, - ETHTOOL_A_STRINGSET_ID, /* u32 */ - ETHTOOL_A_STRINGSET_COUNT, /* u32 */ - ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSET_CNT, - ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSETS_UNSPEC, - ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSETS_CNT, - ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 -}; - -/* STRSET */ - -enum { - ETHTOOL_A_STRSET_UNSPEC, - ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ - ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_STRSET_CNT, - ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 -}; - -/* LINKINFO */ - -enum { - ETHTOOL_A_LINKINFO_UNSPEC, - ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKINFO_PORT, /* u8 */ - ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ - 
ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ - ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKINFO_CNT, - ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 -}; - -/* LINKMODES */ - -enum { - ETHTOOL_A_LINKMODES_UNSPEC, - ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ - ETHTOOL_A_LINKMODES_OURS, /* bitset */ - ETHTOOL_A_LINKMODES_PEER, /* bitset */ - ETHTOOL_A_LINKMODES_SPEED, /* u32 */ - ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ - ETHTOOL_A_LINKMODES_LANES, /* u32 */ - ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKMODES_CNT, - ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 -}; - -/* LINKSTATE */ - -enum { - ETHTOOL_A_LINKSTATE_UNSPEC, - ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKSTATE_LINK, /* u8 */ - ETHTOOL_A_LINKSTATE_SQI, /* u32 */ - ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ - ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKSTATE_CNT, - ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 -}; - -/* DEBUG */ - -enum { - ETHTOOL_A_DEBUG_UNSPEC, - ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_DEBUG_CNT, - ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 -}; - -/* WOL */ - -enum { - ETHTOOL_A_WOL_UNSPEC, - ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_WOL_MODES, /* bitset */ - ETHTOOL_A_WOL_SOPASS, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_WOL_CNT, - ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 -}; - -/* FEATURES */ - -enum { - ETHTOOL_A_FEATURES_UNSPEC, - ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEATURES_HW, /* bitset */ - ETHTOOL_A_FEATURES_WANTED, /* bitset */ - ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ - ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_FEATURES_CNT, - ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 -}; - -/* PRIVFLAGS */ - -enum { - ETHTOOL_A_PRIVFLAGS_UNSPEC, - ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_PRIVFLAGS_CNT, - ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 -}; - -/* RINGS */ - -enum { - ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, - ETHTOOL_TCP_DATA_SPLIT_DISABLED, - ETHTOOL_TCP_DATA_SPLIT_ENABLED, -}; - -enum { - ETHTOOL_A_RINGS_UNSPEC, - ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_RINGS_RX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ - ETHTOOL_A_RINGS_TX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ - ETHTOOL_A_RINGS_TX, /* u32 */ - ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ - ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_RINGS_CNT, - ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) -}; - -/* CHANNELS */ - -enum { - ETHTOOL_A_CHANNELS_UNSPEC, - 
ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_CHANNELS_CNT, - ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) -}; - -/* COALESCE */ - -enum { - ETHTOOL_A_COALESCE_UNSPEC, - ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ - ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_RX_PROFILE, - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_TX_PROFILE, - - /* add new constants above here */ - __ETHTOOL_A_COALESCE_CNT, - ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) -}; - -enum { - ETHTOOL_A_PROFILE_UNSPEC, - /* nest, _A_IRQ_MODERATION_* */ - ETHTOOL_A_PROFILE_IRQ_MODERATION, - __ETHTOOL_A_PROFILE_CNT, - ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) -}; - -enum { - ETHTOOL_A_IRQ_MODERATION_UNSPEC, - ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ - - __ETHTOOL_A_IRQ_MODERATION_CNT, - ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) -}; - -/* PAUSE */ - -enum { - ETHTOOL_A_PAUSE_UNSPEC, - ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ - ETHTOOL_A_PAUSE_RX, /* u8 */ - ETHTOOL_A_PAUSE_TX, /* u8 */ - ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ - ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PAUSE_CNT, - ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) -}; - -enum { - ETHTOOL_A_PAUSE_STAT_UNSPEC, - ETHTOOL_A_PAUSE_STAT_PAD, - - ETHTOOL_A_PAUSE_STAT_TX_FRAMES, - ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - - /* add new constants above here - * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! 
- */ - __ETHTOOL_A_PAUSE_STAT_CNT, - ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) -}; - -/* EEE */ - -enum { - ETHTOOL_A_EEE_UNSPEC, - ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_EEE_MODES_OURS, /* bitset */ - ETHTOOL_A_EEE_MODES_PEER, /* bitset */ - ETHTOOL_A_EEE_ACTIVE, /* u8 */ - ETHTOOL_A_EEE_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_EEE_CNT, - ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) -}; - -/* TSINFO */ - -enum { - ETHTOOL_A_TSINFO_UNSPEC, - ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ - ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ - ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ - ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ - ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ - - /* add new constants above here */ - __ETHTOOL_A_TSINFO_CNT, - ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) -}; - -enum { - ETHTOOL_A_TS_STAT_UNSPEC, - - ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ - ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ - ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_TS_STAT_CNT, - ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) - -}; - -/* PHC VCLOCKS */ - -enum { - ETHTOOL_A_PHC_VCLOCKS_UNSPEC, - ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ - ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ - - /* add new constants above here */ - __ETHTOOL_A_PHC_VCLOCKS_CNT, - ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) -}; - -/* CABLE TEST */ - -enum { - ETHTOOL_A_CABLE_TEST_UNSPEC, - ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_CNT, - ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 -}; - /* CABLE TEST NOTIFY */ enum { ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC, @@ -583,73 +50,11 @@ enum { }; enum { - ETHTOOL_A_CABLE_RESULT_UNSPEC, - ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ - ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_RESULT_CNT, - ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, - ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ - ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, - ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) -}; - -enum { ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC, ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED, ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED }; -enum { - ETHTOOL_A_CABLE_NEST_UNSPEC, - ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ - ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ - __ETHTOOL_A_CABLE_NEST_CNT, - ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ - - __ETHTOOL_A_CABLE_TEST_NTF_CNT, - ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) -}; - -/* CABLE TEST TDR */ - -enum { - ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 
*/ - ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, - ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 -}; - -enum { - ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CNT, - ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 -}; - /* CABLE TEST TDR NOTIFY */ enum { @@ -690,132 +95,6 @@ enum { }; enum { - ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, - ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 -}; - -/* TUNNEL INFO */ - -enum { - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, - ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, - - __ETHTOOL_UDP_TUNNEL_TYPE_CNT -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ - ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, - ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ - ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ - ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, - ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_CNT, - ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_INFO_UNSPEC, - ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_INFO_CNT, - ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) -}; - -/* FEC */ - -enum { - ETHTOOL_A_FEC_UNSPEC, - ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEC_MODES, /* bitset */ - ETHTOOL_A_FEC_AUTO, /* u8 */ - ETHTOOL_A_FEC_ACTIVE, /* u32 */ - ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ - - __ETHTOOL_A_FEC_CNT, - ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) -}; - -enum { - ETHTOOL_A_FEC_STAT_UNSPEC, - ETHTOOL_A_FEC_STAT_PAD, - - ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ - ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ - ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ - - /* add new constants above here */ - __ETHTOOL_A_FEC_STAT_CNT, - ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) -}; - -/* MODULE EEPROM */ - -enum { - ETHTOOL_A_MODULE_EEPROM_UNSPEC, - ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ - - __ETHTOOL_A_MODULE_EEPROM_CNT, - ETHTOOL_A_MODULE_EEPROM_MAX 
= (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) -}; - -/* STATS */ - -enum { - ETHTOOL_A_STATS_UNSPEC, - ETHTOOL_A_STATS_PAD, - ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STATS_GROUPS, /* bitset */ - - ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ - - ETHTOOL_A_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_CNT, - ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) -}; - -enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, ETHTOOL_STATS_ETH_CTRL, @@ -826,27 +105,6 @@ enum { }; enum { - ETHTOOL_A_STATS_GRP_UNSPEC, - ETHTOOL_A_STATS_GRP_PAD, - - ETHTOOL_A_STATS_GRP_ID, /* u32 */ - ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ - - ETHTOOL_A_STATS_GRP_STAT, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ - ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_GRP_CNT, - ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) -}; - -enum { /* 30.3.2.1.5 aSymbolErrorDuringCarrier */ ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, @@ -935,155 +193,6 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; -/* MODULE */ - -enum { - ETHTOOL_A_MODULE_UNSPEC, - ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ - ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_CNT, - ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) -}; - -/* Power Sourcing Equipment */ -enum { - ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, - ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ -}; - -enum { - ETHTOOL_A_PSE_UNSPEC, - ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ - ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ - ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_PSE_CNT, - ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) -}; - -enum { - ETHTOOL_A_RSS_UNSPEC, - ETHTOOL_A_RSS_HEADER, - ETHTOOL_A_RSS_CONTEXT, /* u32 */ - ETHTOOL_A_RSS_HFUNC, /* u32 */ - ETHTOOL_A_RSS_INDIR, /* binary */ - ETHTOOL_A_RSS_HKEY, /* binary */ - ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ - ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ - - __ETHTOOL_A_RSS_CNT, - ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), -}; - -/* PLCA */ - -enum { - ETHTOOL_A_PLCA_UNSPEC, - ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PLCA_VERSION, /* u16 */ - ETHTOOL_A_PLCA_ENABLED, /* u8 */ - ETHTOOL_A_PLCA_STATUS, /* u8 */ - ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ - ETHTOOL_A_PLCA_NODE_ID, /* u32 */ - ETHTOOL_A_PLCA_TO_TMR, /* u32 */ - ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ - ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PLCA_CNT, - ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) -}; - -/* MAC Merge (802.3) */ - -enum { - ETHTOOL_A_MM_STAT_UNSPEC, - ETHTOOL_A_MM_STAT_PAD, - - /* aMACMergeFrameAssErrorCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ - /* aMACMergeFrameSmdErrorCount */ - 
ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ - /* aMACMergeFrameAssOkCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ - /* aMACMergeFragCountRx */ - ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ - /* aMACMergeFragCountTx */ - ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ - /* aMACMergeHoldCount */ - ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_MM_STAT_CNT, - ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) -}; - -enum { - ETHTOOL_A_MM_UNSPEC, - ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ - ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ - ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ - ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ - - /* add new constants above here */ - __ETHTOOL_A_MM_CNT, - ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) -}; - -/* MODULE_FW_FLASH */ - -enum { - ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, - ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ - ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_FW_FLASH_CNT, - ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) -}; - -enum { - ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_PHY_CNT, - ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) -}; /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h new file mode 100644 index 000000000000..43993a2d68e5 --- /dev/null +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -0,0 +1,818 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ethtool.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H +#define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H + +#define ETHTOOL_FAMILY_NAME "ethtool" +#define ETHTOOL_FAMILY_VERSION 1 + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + /* private: */ + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + ETHTOOL_UDP_TUNNEL_TYPE_MAX = (__ETHTOOL_UDP_TUNNEL_TYPE_CNT - 1) +}; + +/** + * enum ethtool_header_flags - common ethtool header flags + * @ETHTOOL_FLAG_COMPACT_BITSETS: use compact bitsets in reply + * @ETHTOOL_FLAG_OMIT_REPLY: provide optional reply for SET or ACT requests + * @ETHTOOL_FLAG_STATS: request statistics, if supported by the driver + */ +enum ethtool_header_flags { + ETHTOOL_FLAG_COMPACT_BITSETS = 1, + ETHTOOL_FLAG_OMIT_REPLY = 2, + ETHTOOL_FLAG_STATS = 4, +}; + +enum { + ETHTOOL_PHY_UPSTREAM_TYPE_MAC, + ETHTOOL_PHY_UPSTREAM_TYPE_PHY, +}; + +enum 
ethtool_tcp_data_split { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + +enum { + ETHTOOL_A_HEADER_UNSPEC, + ETHTOOL_A_HEADER_DEV_INDEX, + ETHTOOL_A_HEADER_DEV_NAME, + ETHTOOL_A_HEADER_FLAGS, + ETHTOOL_A_HEADER_PHY_INDEX, + + __ETHTOOL_A_HEADER_CNT, + ETHTOOL_A_HEADER_MAX = (__ETHTOOL_A_HEADER_CNT - 1) +}; + +enum { + ETHTOOL_A_BITSET_BIT_UNSPEC, + ETHTOOL_A_BITSET_BIT_INDEX, + ETHTOOL_A_BITSET_BIT_NAME, + ETHTOOL_A_BITSET_BIT_VALUE, + + __ETHTOOL_A_BITSET_BIT_CNT, + ETHTOOL_A_BITSET_BIT_MAX = (__ETHTOOL_A_BITSET_BIT_CNT - 1) +}; + +enum { + ETHTOOL_A_BITSET_BITS_UNSPEC, + ETHTOOL_A_BITSET_BITS_BIT, + + __ETHTOOL_A_BITSET_BITS_CNT, + ETHTOOL_A_BITSET_BITS_MAX = (__ETHTOOL_A_BITSET_BITS_CNT - 1) +}; + +enum { + ETHTOOL_A_BITSET_UNSPEC, + ETHTOOL_A_BITSET_NOMASK, + ETHTOOL_A_BITSET_SIZE, + ETHTOOL_A_BITSET_BITS, + ETHTOOL_A_BITSET_VALUE, + ETHTOOL_A_BITSET_MASK, + + __ETHTOOL_A_BITSET_CNT, + ETHTOOL_A_BITSET_MAX = (__ETHTOOL_A_BITSET_CNT - 1) +}; + +enum { + ETHTOOL_A_STRING_UNSPEC, + ETHTOOL_A_STRING_INDEX, + ETHTOOL_A_STRING_VALUE, + + __ETHTOOL_A_STRING_CNT, + ETHTOOL_A_STRING_MAX = (__ETHTOOL_A_STRING_CNT - 1) +}; + +enum { + ETHTOOL_A_STRINGS_UNSPEC, + ETHTOOL_A_STRINGS_STRING, + + __ETHTOOL_A_STRINGS_CNT, + ETHTOOL_A_STRINGS_MAX = (__ETHTOOL_A_STRINGS_CNT - 1) +}; + +enum { + ETHTOOL_A_STRINGSET_UNSPEC, + ETHTOOL_A_STRINGSET_ID, + ETHTOOL_A_STRINGSET_COUNT, + ETHTOOL_A_STRINGSET_STRINGS, + + __ETHTOOL_A_STRINGSET_CNT, + ETHTOOL_A_STRINGSET_MAX = (__ETHTOOL_A_STRINGSET_CNT - 1) +}; + +enum { + ETHTOOL_A_STRINGSETS_UNSPEC, + ETHTOOL_A_STRINGSETS_STRINGSET, + + __ETHTOOL_A_STRINGSETS_CNT, + ETHTOOL_A_STRINGSETS_MAX = (__ETHTOOL_A_STRINGSETS_CNT - 1) +}; + +enum { + ETHTOOL_A_STRSET_UNSPEC, + ETHTOOL_A_STRSET_HEADER, + ETHTOOL_A_STRSET_STRINGSETS, + ETHTOOL_A_STRSET_COUNTS_ONLY, + + __ETHTOOL_A_STRSET_CNT, + ETHTOOL_A_STRSET_MAX = (__ETHTOOL_A_STRSET_CNT - 1) +}; + +enum { + ETHTOOL_A_PRIVFLAGS_UNSPEC, + ETHTOOL_A_PRIVFLAGS_HEADER, + ETHTOOL_A_PRIVFLAGS_FLAGS, + + __ETHTOOL_A_PRIVFLAGS_CNT, + ETHTOOL_A_PRIVFLAGS_MAX = (__ETHTOOL_A_PRIVFLAGS_CNT - 1) +}; + +enum { + ETHTOOL_A_RINGS_UNSPEC, + ETHTOOL_A_RINGS_HEADER, + ETHTOOL_A_RINGS_RX_MAX, + ETHTOOL_A_RINGS_RX_MINI_MAX, + ETHTOOL_A_RINGS_RX_JUMBO_MAX, + ETHTOOL_A_RINGS_TX_MAX, + ETHTOOL_A_RINGS_RX, + ETHTOOL_A_RINGS_RX_MINI, + ETHTOOL_A_RINGS_RX_JUMBO, + ETHTOOL_A_RINGS_TX, + ETHTOOL_A_RINGS_RX_BUF_LEN, + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, + ETHTOOL_A_RINGS_CQE_SIZE, + ETHTOOL_A_RINGS_TX_PUSH, + ETHTOOL_A_RINGS_RX_PUSH, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, + + __ETHTOOL_A_RINGS_CNT, + ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) +}; + +enum { + ETHTOOL_A_MM_STAT_UNSPEC, + ETHTOOL_A_MM_STAT_PAD, + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, + ETHTOOL_A_MM_STAT_SMD_ERRORS, + ETHTOOL_A_MM_STAT_REASSEMBLY_OK, + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_HOLD_COUNT, + + __ETHTOOL_A_MM_STAT_CNT, + ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_MM_UNSPEC, + ETHTOOL_A_MM_HEADER, + ETHTOOL_A_MM_PMAC_ENABLED, + ETHTOOL_A_MM_TX_ENABLED, + ETHTOOL_A_MM_TX_ACTIVE, + ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_VERIFY_ENABLED, + ETHTOOL_A_MM_VERIFY_STATUS, + ETHTOOL_A_MM_VERIFY_TIME, + ETHTOOL_A_MM_MAX_VERIFY_TIME, + ETHTOOL_A_MM_STATS, + + __ETHTOOL_A_MM_CNT, + ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) +}; + +enum { + ETHTOOL_A_LINKINFO_UNSPEC, + 
ETHTOOL_A_LINKINFO_HEADER, + ETHTOOL_A_LINKINFO_PORT, + ETHTOOL_A_LINKINFO_PHYADDR, + ETHTOOL_A_LINKINFO_TP_MDIX, + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + ETHTOOL_A_LINKINFO_TRANSCEIVER, + + __ETHTOOL_A_LINKINFO_CNT, + ETHTOOL_A_LINKINFO_MAX = (__ETHTOOL_A_LINKINFO_CNT - 1) +}; + +enum { + ETHTOOL_A_LINKMODES_UNSPEC, + ETHTOOL_A_LINKMODES_HEADER, + ETHTOOL_A_LINKMODES_AUTONEG, + ETHTOOL_A_LINKMODES_OURS, + ETHTOOL_A_LINKMODES_PEER, + ETHTOOL_A_LINKMODES_SPEED, + ETHTOOL_A_LINKMODES_DUPLEX, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + ETHTOOL_A_LINKMODES_LANES, + ETHTOOL_A_LINKMODES_RATE_MATCHING, + + __ETHTOOL_A_LINKMODES_CNT, + ETHTOOL_A_LINKMODES_MAX = (__ETHTOOL_A_LINKMODES_CNT - 1) +}; + +enum { + ETHTOOL_A_LINKSTATE_UNSPEC, + ETHTOOL_A_LINKSTATE_HEADER, + ETHTOOL_A_LINKSTATE_LINK, + ETHTOOL_A_LINKSTATE_SQI, + ETHTOOL_A_LINKSTATE_SQI_MAX, + ETHTOOL_A_LINKSTATE_EXT_STATE, + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + + __ETHTOOL_A_LINKSTATE_CNT, + ETHTOOL_A_LINKSTATE_MAX = (__ETHTOOL_A_LINKSTATE_CNT - 1) +}; + +enum { + ETHTOOL_A_DEBUG_UNSPEC, + ETHTOOL_A_DEBUG_HEADER, + ETHTOOL_A_DEBUG_MSGMASK, + + __ETHTOOL_A_DEBUG_CNT, + ETHTOOL_A_DEBUG_MAX = (__ETHTOOL_A_DEBUG_CNT - 1) +}; + +enum { + ETHTOOL_A_WOL_UNSPEC, + ETHTOOL_A_WOL_HEADER, + ETHTOOL_A_WOL_MODES, + ETHTOOL_A_WOL_SOPASS, + + __ETHTOOL_A_WOL_CNT, + ETHTOOL_A_WOL_MAX = (__ETHTOOL_A_WOL_CNT - 1) +}; + +enum { + ETHTOOL_A_FEATURES_UNSPEC, + ETHTOOL_A_FEATURES_HEADER, + ETHTOOL_A_FEATURES_HW, + ETHTOOL_A_FEATURES_WANTED, + ETHTOOL_A_FEATURES_ACTIVE, + ETHTOOL_A_FEATURES_NOCHANGE, + + __ETHTOOL_A_FEATURES_CNT, + ETHTOOL_A_FEATURES_MAX = (__ETHTOOL_A_FEATURES_CNT - 1) +}; + +enum { + ETHTOOL_A_CHANNELS_UNSPEC, + ETHTOOL_A_CHANNELS_HEADER, + ETHTOOL_A_CHANNELS_RX_MAX, + ETHTOOL_A_CHANNELS_TX_MAX, + ETHTOOL_A_CHANNELS_OTHER_MAX, + ETHTOOL_A_CHANNELS_COMBINED_MAX, + ETHTOOL_A_CHANNELS_RX_COUNT, + ETHTOOL_A_CHANNELS_TX_COUNT, + ETHTOOL_A_CHANNELS_OTHER_COUNT, + ETHTOOL_A_CHANNELS_COMBINED_COUNT, + + __ETHTOOL_A_CHANNELS_CNT, + ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) +}; + +enum { + ETHTOOL_A_IRQ_MODERATION_UNSPEC, + ETHTOOL_A_IRQ_MODERATION_USEC, + ETHTOOL_A_IRQ_MODERATION_PKTS, + ETHTOOL_A_IRQ_MODERATION_COMPS, + + __ETHTOOL_A_IRQ_MODERATION_CNT, + ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) +}; + +enum { + ETHTOOL_A_PROFILE_UNSPEC, + ETHTOOL_A_PROFILE_IRQ_MODERATION, + + __ETHTOOL_A_PROFILE_CNT, + ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) +}; + +enum { + ETHTOOL_A_COALESCE_UNSPEC, + ETHTOOL_A_COALESCE_HEADER, + ETHTOOL_A_COALESCE_RX_USECS, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES, + ETHTOOL_A_COALESCE_RX_USECS_IRQ, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_TX_USECS, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_USECS_IRQ, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, + ETHTOOL_A_COALESCE_PKT_RATE_LOW, + ETHTOOL_A_COALESCE_RX_USECS_LOW, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_TX_USECS_LOW, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_PKT_RATE_HIGH, + ETHTOOL_A_COALESCE_RX_USECS_HIGH, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_TX_USECS_HIGH, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + 
ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, + ETHTOOL_A_COALESCE_RX_PROFILE, + ETHTOOL_A_COALESCE_TX_PROFILE, + + __ETHTOOL_A_COALESCE_CNT, + ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) +}; + +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_PAUSE_UNSPEC, + ETHTOOL_A_PAUSE_HEADER, + ETHTOOL_A_PAUSE_AUTONEG, + ETHTOOL_A_PAUSE_RX, + ETHTOOL_A_PAUSE_TX, + ETHTOOL_A_PAUSE_STATS, + ETHTOOL_A_PAUSE_STATS_SRC, + + __ETHTOOL_A_PAUSE_CNT, + ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) +}; + +enum { + ETHTOOL_A_EEE_UNSPEC, + ETHTOOL_A_EEE_HEADER, + ETHTOOL_A_EEE_MODES_OURS, + ETHTOOL_A_EEE_MODES_PEER, + ETHTOOL_A_EEE_ACTIVE, + ETHTOOL_A_EEE_ENABLED, + ETHTOOL_A_EEE_TX_LPI_ENABLED, + ETHTOOL_A_EEE_TX_LPI_TIMER, + + __ETHTOOL_A_EEE_CNT, + ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) +}; + +enum { + ETHTOOL_A_TS_STAT_UNSPEC, + ETHTOOL_A_TS_STAT_TX_PKTS, + ETHTOOL_A_TS_STAT_TX_LOST, + ETHTOOL_A_TS_STAT_TX_ERR, + + __ETHTOOL_A_TS_STAT_CNT, + ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_UNSPEC, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + + __ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX = (__ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT - 1) +}; + +enum { + ETHTOOL_A_TSINFO_UNSPEC, + ETHTOOL_A_TSINFO_HEADER, + ETHTOOL_A_TSINFO_TIMESTAMPING, + ETHTOOL_A_TSINFO_TX_TYPES, + ETHTOOL_A_TSINFO_RX_FILTERS, + ETHTOOL_A_TSINFO_PHC_INDEX, + ETHTOOL_A_TSINFO_STATS, + ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER, + + __ETHTOOL_A_TSINFO_CNT, + ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_RESULT_UNSPEC, + ETHTOOL_A_CABLE_RESULT_PAIR, + ETHTOOL_A_CABLE_RESULT_CODE, + ETHTOOL_A_CABLE_RESULT_SRC, + + __ETHTOOL_A_CABLE_RESULT_CNT, + ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, + ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, + + __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, + ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_NEST_UNSPEC, + ETHTOOL_A_CABLE_NEST_RESULT, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, + + __ETHTOOL_A_CABLE_NEST_CNT, + ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, + + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = (__ETHTOOL_A_CABLE_TEST_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_NEST, + + __ETHTOOL_A_CABLE_TEST_NTF_CNT, + ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, + + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, + + 
__ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_CFG, + + __ETHTOOL_A_CABLE_TEST_TDR_CNT, + ETHTOOL_A_CABLE_TEST_TDR_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, + + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE, + + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, + + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + +enum { + ETHTOOL_A_FEC_STAT_UNSPEC, + ETHTOOL_A_FEC_STAT_PAD, + ETHTOOL_A_FEC_STAT_CORRECTED, + ETHTOOL_A_FEC_STAT_UNCORR, + ETHTOOL_A_FEC_STAT_CORR_BITS, + + __ETHTOOL_A_FEC_STAT_CNT, + ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, + ETHTOOL_A_FEC_MODES, + ETHTOOL_A_FEC_AUTO, + ETHTOOL_A_FEC_ACTIVE, + ETHTOOL_A_FEC_STATS, + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + +enum { + ETHTOOL_A_MODULE_EEPROM_UNSPEC, + ETHTOOL_A_MODULE_EEPROM_HEADER, + ETHTOOL_A_MODULE_EEPROM_OFFSET, + ETHTOOL_A_MODULE_EEPROM_LENGTH, + ETHTOOL_A_MODULE_EEPROM_PAGE, + ETHTOOL_A_MODULE_EEPROM_BANK, + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, + ETHTOOL_A_MODULE_EEPROM_DATA, + + __ETHTOOL_A_MODULE_EEPROM_CNT, + ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) +}; + +enum { + ETHTOOL_A_STATS_GRP_UNSPEC, + ETHTOOL_A_STATS_GRP_PAD, + ETHTOOL_A_STATS_GRP_ID, + ETHTOOL_A_STATS_GRP_SS_ID, + ETHTOOL_A_STATS_GRP_STAT, + ETHTOOL_A_STATS_GRP_HIST_RX, + ETHTOOL_A_STATS_GRP_HIST_TX, + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ETHTOOL_A_STATS_GRP_HIST_VAL, + + __ETHTOOL_A_STATS_GRP_CNT, + ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) +}; + +enum { + ETHTOOL_A_STATS_UNSPEC, + ETHTOOL_A_STATS_PAD, + ETHTOOL_A_STATS_HEADER, + ETHTOOL_A_STATS_GROUPS, + ETHTOOL_A_STATS_GRP, + ETHTOOL_A_STATS_SRC, + + __ETHTOOL_A_STATS_CNT, + ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +enum { + ETHTOOL_A_PHC_VCLOCKS_UNSPEC, + ETHTOOL_A_PHC_VCLOCKS_HEADER, + ETHTOOL_A_PHC_VCLOCKS_NUM, + ETHTOOL_A_PHC_VCLOCKS_INDEX, + + __ETHTOOL_A_PHC_VCLOCKS_CNT, + ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) +}; + +enum { + ETHTOOL_A_MODULE_UNSPEC, + ETHTOOL_A_MODULE_HEADER, + ETHTOOL_A_MODULE_POWER_MODE_POLICY, + ETHTOOL_A_MODULE_POWER_MODE, + + __ETHTOOL_A_MODULE_CNT, + ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) +}; + +enum { + ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, + ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, + ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, + + __ETHTOOL_A_C33_PSE_PW_LIMIT_CNT, + __ETHTOOL_A_C33_PSE_PW_LIMIT_MAX = (__ETHTOOL_A_C33_PSE_PW_LIMIT_CNT - 1) +}; + +enum { + ETHTOOL_A_PSE_UNSPEC, + ETHTOOL_A_PSE_HEADER, + ETHTOOL_A_PODL_PSE_ADMIN_STATE, + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, + 
ETHTOOL_A_PODL_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_ADMIN_STATE, + ETHTOOL_A_C33_PSE_ADMIN_CONTROL, + ETHTOOL_A_C33_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_PW_CLASS, + ETHTOOL_A_C33_PSE_ACTUAL_PW, + ETHTOOL_A_C33_PSE_EXT_STATE, + ETHTOOL_A_C33_PSE_EXT_SUBSTATE, + ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, + ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, + + __ETHTOOL_A_PSE_CNT, + ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) +}; + +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, + ETHTOOL_A_RSS_HFUNC, + ETHTOOL_A_RSS_INDIR, + ETHTOOL_A_RSS_HKEY, + ETHTOOL_A_RSS_INPUT_XFRM, + ETHTOOL_A_RSS_START_CONTEXT, + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1) +}; + +enum { + ETHTOOL_A_PLCA_UNSPEC, + ETHTOOL_A_PLCA_HEADER, + ETHTOOL_A_PLCA_VERSION, + ETHTOOL_A_PLCA_ENABLED, + ETHTOOL_A_PLCA_STATUS, + ETHTOOL_A_PLCA_NODE_CNT, + ETHTOOL_A_PLCA_NODE_ID, + ETHTOOL_A_PLCA_TO_TMR, + ETHTOOL_A_PLCA_BURST_CNT, + ETHTOOL_A_PLCA_BURST_TMR, + + __ETHTOOL_A_PLCA_CNT, + ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) +}; + +enum { + ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, + ETHTOOL_A_MODULE_FW_FLASH_HEADER, + ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, + ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, + ETHTOOL_A_MODULE_FW_FLASH_STATUS, + ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, + ETHTOOL_A_MODULE_FW_FLASH_DONE, + ETHTOOL_A_MODULE_FW_FLASH_TOTAL, + + __ETHTOOL_A_MODULE_FW_FLASH_CNT, + ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) +}; + +enum { + ETHTOOL_A_PHY_UNSPEC, + ETHTOOL_A_PHY_HEADER, + ETHTOOL_A_PHY_INDEX, + ETHTOOL_A_PHY_DRVNAME, + ETHTOOL_A_PHY_NAME, + ETHTOOL_A_PHY_UPSTREAM_TYPE, + ETHTOOL_A_PHY_UPSTREAM_INDEX, + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + + __ETHTOOL_A_PHY_CNT, + ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) +}; + +enum { + ETHTOOL_A_TSCONFIG_UNSPEC, + ETHTOOL_A_TSCONFIG_HEADER, + ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER, + ETHTOOL_A_TSCONFIG_TX_TYPES, + ETHTOOL_A_TSCONFIG_RX_FILTERS, + ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + + __ETHTOOL_A_TSCONFIG_CNT, + ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1) +}; + +enum { + ETHTOOL_MSG_USER_NONE = 0, + ETHTOOL_MSG_STRSET_GET = 1, + ETHTOOL_MSG_LINKINFO_GET, + ETHTOOL_MSG_LINKINFO_SET, + ETHTOOL_MSG_LINKMODES_GET, + ETHTOOL_MSG_LINKMODES_SET, + ETHTOOL_MSG_LINKSTATE_GET, + ETHTOOL_MSG_DEBUG_GET, + ETHTOOL_MSG_DEBUG_SET, + ETHTOOL_MSG_WOL_GET, + ETHTOOL_MSG_WOL_SET, + ETHTOOL_MSG_FEATURES_GET, + ETHTOOL_MSG_FEATURES_SET, + ETHTOOL_MSG_PRIVFLAGS_GET, + ETHTOOL_MSG_PRIVFLAGS_SET, + ETHTOOL_MSG_RINGS_GET, + ETHTOOL_MSG_RINGS_SET, + ETHTOOL_MSG_CHANNELS_GET, + ETHTOOL_MSG_CHANNELS_SET, + ETHTOOL_MSG_COALESCE_GET, + ETHTOOL_MSG_COALESCE_SET, + ETHTOOL_MSG_PAUSE_GET, + ETHTOOL_MSG_PAUSE_SET, + ETHTOOL_MSG_EEE_GET, + ETHTOOL_MSG_EEE_SET, + ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, + ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, + ETHTOOL_MSG_MODULE_EEPROM_GET, + ETHTOOL_MSG_STATS_GET, + ETHTOOL_MSG_PHC_VCLOCKS_GET, + ETHTOOL_MSG_MODULE_GET, + ETHTOOL_MSG_MODULE_SET, + ETHTOOL_MSG_PSE_GET, + ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, + ETHTOOL_MSG_PLCA_GET_CFG, + ETHTOOL_MSG_PLCA_SET_CFG, + ETHTOOL_MSG_PLCA_GET_STATUS, + ETHTOOL_MSG_MM_GET, + ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_MODULE_FW_FLASH_ACT, + ETHTOOL_MSG_PHY_GET, + ETHTOOL_MSG_TSCONFIG_GET, + ETHTOOL_MSG_TSCONFIG_SET, + + __ETHTOOL_MSG_USER_CNT, + ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) +}; + +enum { + ETHTOOL_MSG_KERNEL_NONE = 0, + 
ETHTOOL_MSG_STRSET_GET_REPLY = 1, + ETHTOOL_MSG_LINKINFO_GET_REPLY, + ETHTOOL_MSG_LINKINFO_NTF, + ETHTOOL_MSG_LINKMODES_GET_REPLY, + ETHTOOL_MSG_LINKMODES_NTF, + ETHTOOL_MSG_LINKSTATE_GET_REPLY, + ETHTOOL_MSG_DEBUG_GET_REPLY, + ETHTOOL_MSG_DEBUG_NTF, + ETHTOOL_MSG_WOL_GET_REPLY, + ETHTOOL_MSG_WOL_NTF, + ETHTOOL_MSG_FEATURES_GET_REPLY, + ETHTOOL_MSG_FEATURES_SET_REPLY, + ETHTOOL_MSG_FEATURES_NTF, + ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + ETHTOOL_MSG_PRIVFLAGS_NTF, + ETHTOOL_MSG_RINGS_GET_REPLY, + ETHTOOL_MSG_RINGS_NTF, + ETHTOOL_MSG_CHANNELS_GET_REPLY, + ETHTOOL_MSG_CHANNELS_NTF, + ETHTOOL_MSG_COALESCE_GET_REPLY, + ETHTOOL_MSG_COALESCE_NTF, + ETHTOOL_MSG_PAUSE_GET_REPLY, + ETHTOOL_MSG_PAUSE_NTF, + ETHTOOL_MSG_EEE_GET_REPLY, + ETHTOOL_MSG_EEE_NTF, + ETHTOOL_MSG_TSINFO_GET_REPLY, + ETHTOOL_MSG_CABLE_TEST_NTF, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, + ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + ETHTOOL_MSG_STATS_GET_REPLY, + ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + ETHTOOL_MSG_MODULE_GET_REPLY, + ETHTOOL_MSG_MODULE_NTF, + ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, + ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + ETHTOOL_MSG_PLCA_NTF, + ETHTOOL_MSG_MM_GET_REPLY, + ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_MODULE_FW_FLASH_NTF, + ETHTOOL_MSG_PHY_GET_REPLY, + ETHTOOL_MSG_PHY_NTF, + ETHTOOL_MSG_TSCONFIG_GET_REPLY, + ETHTOOL_MSG_TSCONFIG_SET_REPLY, + + __ETHTOOL_MSG_KERNEL_CNT, + ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) +}; + +#endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index a6924dd3aff1..00e9890ca3c0 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -68,6 +68,8 @@ enum { FRA_SPORT_RANGE, /* sport */ FRA_DPORT_RANGE, /* dport */ FRA_DSCP, /* dscp */ + FRA_FLOWLABEL, /* flowlabel */ + FRA_FLOWLABEL_MASK, /* flowlabel mask */ __FRA_MAX }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b48..77730c340c8f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1394,6 +1394,7 @@ enum { IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ + IFLA_VXLAN_RESERVED_BITS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 858339d1c1c4..55b0ab51096c 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,6 +13,17 @@ #include <linux/types.h> #include <linux/socket.h> /* for SO_TIMESTAMPING */ +/* + * Possible type of hwtstamp provider. Mainly "precise" the default one + * is for IEEE 1588 quality and "approx" is for NICs DMA point. 
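Everything in ``ethtool_netlink_generated.h`` above is emitted by YNL from ``Documentation/netlink/specs/ethtool.yaml``, so the generated enums must reproduce the exact values of the hand-written definitions they replace in ``ethtool_netlink.h``. A consumer can pin critical constants at build time; this check is our illustration, not part of the patch:

.. code-block:: c

	#include <linux/ethtool_netlink.h>

	/* The legacy header now pulls in the generated definitions, and
	 * uAPI values must never change: ETHTOOL_A_HEADER_DEV_INDEX
	 * immediately follows ETHTOOL_A_HEADER_UNSPEC, and the message
	 * enums start at an explicit 0.
	 */
	_Static_assert(ETHTOOL_A_HEADER_DEV_INDEX == 1,
		       "generated header changed a uAPI value");
	_Static_assert(ETHTOOL_MSG_USER_NONE == 0,
		       "generated header changed a uAPI value");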
+ */ +enum hwtstamp_provider_qualifier { + HWTSTAMP_PROVIDER_QUALIFIER_PRECISE, + HWTSTAMP_PROVIDER_QUALIFIER_APPROX, + + HWTSTAMP_PROVIDER_QUALIFIER_CNT, +}; + /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index db7254d52d93..5ee94c511a28 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -93,7 +93,11 @@ enum { RTM_NEWPREFIX = 52, #define RTM_NEWPREFIX RTM_NEWPREFIX - RTM_GETMULTICAST = 58, + RTM_NEWMULTICAST = 56, +#define RTM_NEWMULTICAST RTM_NEWMULTICAST + RTM_DELMULTICAST, +#define RTM_DELMULTICAST RTM_DELMULTICAST + RTM_GETMULTICAST, #define RTM_GETMULTICAST RTM_GETMULTICAST RTM_GETANYCAST = 62, @@ -389,6 +393,7 @@ enum rtattr_type_t { RTA_SPORT, RTA_DPORT, RTA_NH_ID, + RTA_FLOWLABEL, __RTA_MAX }; @@ -774,6 +779,10 @@ enum rtnetlink_groups { #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL RTNLGRP_STATS, #define RTNLGRP_STATS RTNLGRP_STATS + RTNLGRP_IPV4_MCADDR, +#define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR + RTNLGRP_IPV6_MCADDR, +#define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index adf5fd78dd50..51da2e00112d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -358,6 +358,11 @@ enum LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ LINUX_MIB_TLSDECRYPTRETRY, /* TlsDecryptRetry */ LINUX_MIB_TLSRXNOPADVIOL, /* TlsRxNoPadViolation */ + LINUX_MIB_TLSRXREKEYOK, /* TlsRxRekeyOk */ + LINUX_MIB_TLSRXREKEYERROR, /* TlsRxRekeyError */ + LINUX_MIB_TLSTXREKEYOK, /* TlsTxRekeyOk */ + LINUX_MIB_TLSTXREKEYERROR, /* TlsTxRekeyError */ + LINUX_MIB_TLSRXREKEYRECEIVED, /* TlsRxRekeyReceived */ __LINUX_MIB_TLSMAX }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a2f46785ac3b..774accbd4a22 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, int err; rxq.dev = xdpf->dev_rx; - rxq.mem = xdpf->mem; + rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3aa002a47a96..482d284a1553 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { int err; @@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; @@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; diff --git a/lib/packing.c b/lib/packing.c index 793942745e34..bb1643d9e64d 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -5,10 +5,37 @@ #include <linux/packing.h> #include <linux/module.h> #include <linux/bitops.h> +#include 
<linux/bits.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/bitrev.h> +#define __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks) \ + ({ \ + for (size_t i = 0; i < (num_fields); i++) { \ + typeof(&(fields)[0]) field = &(fields)[i]; \ + u64 uval; \ + \ + uval = ustruct_field_to_u64(ustruct, field->offset, field->size); \ + \ + __pack(pbuf, uval, field->startbit, field->endbit, \ + pbuflen, quirks); \ + } \ + }) + +#define __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks) \ + ({ \ + for (size_t i = 0; i < (num_fields); i++) { \ + typeof(&(fields)[0]) field = &fields[i]; \ + u64 uval; \ + \ + __unpack(pbuf, &uval, field->startbit, field->endbit, \ + pbuflen, quirks); \ + \ + u64_to_ustruct_field(ustruct, field->offset, field->size, uval); \ + } \ + }) + /** * calculate_box_addr - Determine physical location of byte in buffer * @box: Index of byte within buffer seen as a logical big-endian big number @@ -51,64 +78,29 @@ static size_t calculate_box_addr(size_t box, size_t len, u8 quirks) return offset_of_group + offset_in_group; } -/** - * pack - Pack u64 number into bitfield of buffer. - * - * @pbuf: Pointer to a buffer holding the packed value. - * @uval: CPU-readable unpacked value to pack. - * @startbit: The index (in logical notation, compensated for quirks) where - * the packed value starts within pbuf. Must be larger than, or - * equal to, endbit. - * @endbit: The index (in logical notation, compensated for quirks) where - * the packed value ends within pbuf. Must be smaller than, or equal - * to, startbit. - * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. - * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and - * QUIRK_MSB_ON_THE_RIGHT. - * - * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @pbuf memory will - * be modified on success. - */ -int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, - u8 quirks) +static void __pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) { /* Logical byte indices corresponding to the * start and end of the field. */ - int plogical_first_u8, plogical_last_u8, box; - /* width of the field to access in the pbuf */ - u64 value_width; - - /* startbit is expected to be larger than endbit, and both are - * expected to be within the logically addressable range of the buffer. - */ - if (unlikely(startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen)) - /* Invalid function call */ - return -EINVAL; - - value_width = startbit - endbit + 1; - if (unlikely(value_width > 64)) - return -ERANGE; + int plogical_first_u8 = startbit / BITS_PER_BYTE; + int plogical_last_u8 = endbit / BITS_PER_BYTE; + int value_width = startbit - endbit + 1; + int box; /* Check if "uval" fits in "value_width" bits. - * If value_width is 64, the check will fail, but any - * 64-bit uval will surely fit. + * The test only works for value_width < 64, but in the latter case, + * any 64-bit uval will surely fit. */ - if (unlikely(value_width < 64 && uval >= (1ull << value_width))) - /* Cannot store "uval" inside "value_width" bits. - * Truncating "uval" is most certainly not desirable, - * so simply erroring out is appropriate. 
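The truncation check that replaces this error return is worth restating: a value fits in ``width`` bits iff ``width >= 64`` or ``val < (1ULL << width)``; shifting by 64 or more would be undefined. A standalone sketch, assuming plain userspace C and a hypothetical ``fits_in_width()``:

.. code-block:: c

    #include <stdbool.h>
    #include <stdint.h>

    /* True iff val is representable in `width` bits; the short-circuit
     * avoids the undefined 64-bit shift. */
    static bool fits_in_width(uint64_t val, unsigned int width)
    {
        return width >= 64 || val < (1ULL << width);
    }

This mirrors the ``value_width < 64 && uval >= (1ull << value_width)`` condition used by the ``WARN()`` below.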
- */ - return -ERANGE; + WARN(value_width < 64 && uval >= (1ull << value_width), + "Cannot store 0x%llx inside bits %zu-%zu - will truncate\n", + uval, startbit, endbit); /* Iterate through an idealistic view of the pbuf as an u64 with * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / BITS_PER_BYTE; - plogical_last_u8 = endbit / BITS_PER_BYTE; - for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ size_t box_start_bit, box_end_bit, box_addr; @@ -163,15 +155,13 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, ((u8 *)pbuf)[box_addr] &= ~box_mask; ((u8 *)pbuf)[box_addr] |= pval; } - return 0; } -EXPORT_SYMBOL(pack); /** - * unpack - Unpack u64 number from packed buffer. + * pack - Pack u64 number into bitfield of buffer. * * @pbuf: Pointer to a buffer holding the packed value. - * @uval: Pointer to an u64 holding the unpacked value. + * @uval: CPU-readable unpacked value to pack. * @startbit: The index (in logical notation, compensated for quirks) where * the packed value starts within pbuf. Must be larger than, or * equal to, endbit. @@ -183,19 +173,12 @@ EXPORT_SYMBOL(pack); * QUIRK_MSB_ON_THE_RIGHT. * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @uval will be - * modified on success. + * correct usage, return code may be discarded. The @pbuf memory will + * be modified on success. */ -int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks) +int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, + u8 quirks) { - /* Logical byte indices corresponding to the - * start and end of the field. - */ - int plogical_first_u8, plogical_last_u8, box; - /* width of the field to access in the pbuf */ - u64 value_width; - /* startbit is expected to be larger than endbit, and both are * expected to be within the logically addressable range of the buffer. */ @@ -203,10 +186,25 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, /* Invalid function call */ return -EINVAL; - value_width = startbit - endbit + 1; - if (unlikely(value_width > 64)) + if (unlikely(startbit - endbit >= 64)) return -ERANGE; + __pack(pbuf, uval, startbit, endbit, pbuflen, quirks); + + return 0; +} +EXPORT_SYMBOL(pack); + +static void __unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + /* Logical byte indices corresponding to the + * start and end of the field. + */ + int plogical_first_u8 = startbit / BITS_PER_BYTE; + int plogical_last_u8 = endbit / BITS_PER_BYTE; + int box; + /* Initialize parameter */ *uval = 0; @@ -214,9 +212,6 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / BITS_PER_BYTE; - plogical_last_u8 = endbit / BITS_PER_BYTE; - for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ size_t box_start_bit, box_end_bit, box_addr; @@ -271,6 +266,46 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, *uval &= ~proj_mask; *uval |= pval; } +} + +/** + * unpack - Unpack u64 number from packed buffer. 
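A hedged round-trip sketch of the reworked ``pack()`` and ``unpack()`` wrappers; the bit range, quirks and buffer size are arbitrary examples:

.. code-block:: c

    u8 buf[8] = {};
    u64 out;
    int err;

    /* Store a 16-bit value in logical bits 23..8, then read it back. */
    err = pack(buf, 0x1234, 23, 8, sizeof(buf), QUIRK_LITTLE_ENDIAN);
    if (!err)
        err = unpack(buf, &out, 23, 8, sizeof(buf), QUIRK_LITTLE_ENDIAN);
    if (err)
        pr_err("packing round-trip failed: %d\n", err);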
+ * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. The @uval will be + * modified on success. + */ +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + /* width of the field to access in the pbuf */ + u64 value_width; + + /* startbit is expected to be larger than endbit, and both are + * expected to be within the logically addressable range of the buffer. + */ + if (startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen) + /* Invalid function call */ + return -EINVAL; + + value_width = startbit - endbit + 1; + if (value_width > 64) + return -ERANGE; + + __unpack(pbuf, uval, startbit, endbit, pbuflen, quirks); + return 0; } EXPORT_SYMBOL(unpack); @@ -314,4 +349,130 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, } EXPORT_SYMBOL(packing); +static u64 ustruct_field_to_u64(const void *ustruct, size_t field_offset, + size_t field_size) +{ + switch (field_size) { + case 1: + return *((u8 *)(ustruct + field_offset)); + case 2: + return *((u16 *)(ustruct + field_offset)); + case 4: + return *((u32 *)(ustruct + field_offset)); + default: + return *((u64 *)(ustruct + field_offset)); + } +} + +static void u64_to_ustruct_field(void *ustruct, size_t field_offset, + size_t field_size, u64 uval) +{ + switch (field_size) { + case 1: + *((u8 *)(ustruct + field_offset)) = uval; + break; + case 2: + *((u16 *)(ustruct + field_offset)) = uval; + break; + case 4: + *((u32 *)(ustruct + field_offset)) = uval; + break; + default: + *((u64 *)(ustruct + field_offset)) = uval; + break; + } +} + +/** + * pack_fields_u8 - Pack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u8 definitions. + * @fields: Array of packed_field_u8 field definition. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the pack_fields() macro instead of calling this directly. + */ +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks) +{ + __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(pack_fields_u8); + +/** + * pack_fields_u16 - Pack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. 
+ * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u16 definitions. + * @fields: Array of packed_field_u16 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the pack_fields() macro instead of calling this directly. + */ +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks) +{ + __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(pack_fields_u16); + +/** + * unpack_fields_u8 - Unpack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u8 definitions. + * @fields: Array of packed_field_u8 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the unpack_fields() macro instead of calling this directly. + */ +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks) +{ + __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(unpack_fields_u8); + +/** + * unpack_fields_u16 - Unpack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u16 definitions. + * @fields: Array of packed_field_u16 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the unpack_fields() macro instead of calling this directly. 
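A minimal driver-style sketch of the macro front ends documented here; the struct layout and names are hypothetical, and the kunit tests further below exercise the real thing:

.. code-block:: c

    struct foo_cmd_buf { u8 buf[2]; } __packed;

    struct foo_cmd {
        u16 speed;
        u8 duplex;
    };

    static const struct packed_field_u8 foo_cmd_fields[] = {
        PACKED_FIELD(15, 1, struct foo_cmd, speed),
        PACKED_FIELD(0, 0, struct foo_cmd, duplex),
    };

    static void foo_encode_cmd(struct foo_cmd_buf *pbuf,
                               const struct foo_cmd *cmd)
    {
        pack_fields(pbuf, sizeof(*pbuf), cmd, foo_cmd_fields, 0);
    }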
+ */ +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks) +{ + __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(unpack_fields_u16); + MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); diff --git a/lib/packing_test.c b/lib/packing_test.c index b38ea43c03fd..ce3b83d33b04 100644 --- a/lib/packing_test.c +++ b/lib/packing_test.c @@ -396,9 +396,70 @@ static void packing_test_unpack(struct kunit *test) KUNIT_EXPECT_EQ(test, uval, params->uval); } +#define PACKED_BUF_SIZE 8 + +typedef struct __packed { u8 buf[PACKED_BUF_SIZE]; } packed_buf_t; + +struct test_data { + u32 field3; + u16 field2; + u16 field4; + u16 field6; + u8 field1; + u8 field5; +}; + +static const struct packed_field_u8 test_fields[] = { + PACKED_FIELD(63, 61, struct test_data, field1), + PACKED_FIELD(60, 52, struct test_data, field2), + PACKED_FIELD(51, 28, struct test_data, field3), + PACKED_FIELD(27, 14, struct test_data, field4), + PACKED_FIELD(13, 9, struct test_data, field5), + PACKED_FIELD(8, 0, struct test_data, field6), +}; + +static void packing_test_pack_fields(struct kunit *test) +{ + const struct test_data data = { + .field1 = 0x2, + .field2 = 0x100, + .field3 = 0xF00050, + .field4 = 0x7D3, + .field5 = 0x9, + .field6 = 0x10B, + }; + packed_buf_t expect = { + .buf = { 0x50, 0x0F, 0x00, 0x05, 0x01, 0xF4, 0xD3, 0x0B }, + }; + packed_buf_t buf = {}; + + pack_fields(&buf, sizeof(buf), &data, test_fields, 0); + + KUNIT_EXPECT_MEMEQ(test, &expect, &buf, sizeof(buf)); +} + +static void packing_test_unpack_fields(struct kunit *test) +{ + const packed_buf_t buf = { + .buf = { 0x17, 0x28, 0x10, 0x19, 0x3D, 0xA9, 0x07, 0x9C }, + }; + struct test_data data = {}; + + unpack_fields(&buf, sizeof(buf), &data, test_fields, 0); + + KUNIT_EXPECT_EQ(test, 0, data.field1); + KUNIT_EXPECT_EQ(test, 0x172, data.field2); + KUNIT_EXPECT_EQ(test, 0x810193, data.field3); + KUNIT_EXPECT_EQ(test, 0x36A4, data.field4); + KUNIT_EXPECT_EQ(test, 0x3, data.field5); + KUNIT_EXPECT_EQ(test, 0x19C, data.field6); +} + static struct kunit_case packing_test_cases[] = { KUNIT_CASE_PARAM(packing_test_pack, packing_gen_params), KUNIT_CASE_PARAM(packing_test_unpack, packing_gen_params), + KUNIT_CASE(packing_test_pack_fields), + KUNIT_CASE(packing_test_unpack_fields), {}, }; diff --git a/lib/win_minmax.c b/lib/win_minmax.c index ec10506834b6..1682e614309c 100644 --- a/lib/win_minmax.c +++ b/lib/win_minmax.c @@ -97,3 +97,4 @@ u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) return minmax_subwin_update(m, win, &val); } +EXPORT_SYMBOL(minmax_running_min); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 501ec4249fed..9ae2a7f1738b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -153,7 +153,7 @@ static void xdp_test_run_init_page(netmem_ref netmem, void *arg) new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); - frm->mem = new_ctx->rxq->mem; + frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } @@ -246,7 +246,7 @@ static void reset_ctx(struct xdp_page_head *head) head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); - head->frame->mem = head->orig_ctx.rxq->mem; + head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, diff --git 
a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82bac2426631..902694c0ce64 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -955,6 +955,7 @@ int br_fdb_dump(struct sk_buff *skb, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; @@ -970,7 +971,7 @@ int br_fdb_dump(struct sk_buff *skb, rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index ea733542244c..c1176a5e02c4 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1002,7 +1002,7 @@ static const struct attribute_group bridge_group = { * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); @@ -1023,10 +1023,10 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, return n; } -static struct bin_attribute bridge_forward = { +static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, - .read = brforward_read, + .read_new = brforward_read, }; /* diff --git a/net/can/raw.c b/net/can/raw.c index 255c0a8f39d6..46e8ed9d64da 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -962,7 +962,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc.priority; skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..c7f3dea3e0eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -460,7 +460,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* @@ -4931,7 +4931,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -5033,7 +5033,7 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, } static int -netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; @@ -5057,7 +5057,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; @@ -5110,7 +5110,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
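The ``ndo_fdb_dump_context`` conversions in this range (br_fdb.c above; rtnetlink.c and DSA below) all follow one idiom: a typed context overlaid on the netlink callback scratch space instead of raw ``cb->args[]`` indexing. A hedged sketch, with ``foo_fdb_dump()`` hypothetical:

.. code-block:: c

    static int foo_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
    {
        struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;

        /* Compile-time proof the context fits in cb->ctx. */
        NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);

        /* ... skip entries below ctx->fdb_idx, dump the rest ... */

        return skb->len;
    }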
*/ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -5135,7 +5135,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; @@ -12152,11 +12152,18 @@ static int net_page_pool_create(int cpuid) .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; + int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; @@ -12290,6 +12297,7 @@ out: if (!pp_ptr) continue; + xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a68..aa91eed55a40 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -310,5 +310,8 @@ static inline void dev_xmit_recursion_dec(void) int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 46d43b950471..087a57b7e4fa 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -6,6 +6,7 @@ #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> +#include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> @@ -184,7 +185,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; @@ -266,9 +267,24 @@ static int dev_eth_ioctl(struct net_device *dev, * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. 
*/ -static int dev_get_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg) +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) { + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); @@ -324,11 +340,32 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - bool phy_ts = phy_is_default_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; bool changed = false; + bool phy_ts; int err; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && dev->see_all_hwtstamp_requests) { @@ -350,7 +387,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { - err = phy_hwtstamp_set(dev->phydev, cfg, extack); + err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee11..0b6ed7525b22 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -331,11 +331,11 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. 
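Reading this together with the page_pool_init() hunk below: a pool bound to a dmabuf memory provider must still be created with DMA mapping and sync requested, and the core then force-disables the sync internally, since dmabuf addresses must never reach the dma_sync helpers. A hedged sketch of such parameters, with ``dev`` a placeholder:

.. code-block:: c

    struct page_pool_params pp_params = {
        .flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
        .order     = 0,             /* devmem requires order-0 pages */
        .pool_size = 256,
        .dev       = dev,           /* placeholder device */
        .dma_dir   = DMA_FROM_DEVICE,
    };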
+ */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 34185d138c95..e684ba3ebb38 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -770,6 +770,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..56fe194a76af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4119,13 +4119,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - struct xdp_mem_info *mem_info, bool release) + enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_info, false, zc_frag); + __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4134,19 +4134,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { - struct xdp_mem_info *mem_info = &xdp->rxq->mem; + enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; - if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } - if (release) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), mem_info, false, NULL); - } + if (release) + __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; @@ -4348,9 +4345,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4371,10 +4368,10 @@ err: return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4443,7 +4440,7 @@ err: } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4457,7 +4454,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum 
bpf_map_type map_type = ri->map_type; @@ -4472,9 +4470,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, void *fwd, - enum bpf_map_type map_type, u32 map_id, - u32 flags) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; @@ -4528,7 +4526,8 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -9070,7 +9069,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2e459b9d88eb..6f2647b000b8 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -390,7 +390,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; @@ -414,7 +414,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) - return; + return -ENOMEM; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); @@ -490,7 +490,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb->dev = np->dev; - netpoll_send_skb(np, skb); + return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); @@ -634,7 +634,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!rcu_access_pointer(ndev->npinfo)) { + npinfo = rtnl_dereference(ndev->npinfo); + if (!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; @@ -654,7 +655,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb4..9733206d6406 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -201,6 +201,7 @@ static int page_pool_init(struct page_pool *pool, memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) @@ -287,6 +288,9 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_priv) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, @@ -574,7 +578,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. 
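Also in this range, ``netpoll_send_udp()`` changes from ``void`` to ``int``, so callers can finally observe transmit failures. A hedged caller sketch, with ``np`` and ``msg`` assumed configured elsewhere:

.. code-block:: c

    int err;

    err = netpoll_send_udp(&np, msg, strlen(msg));
    if (err)
        pr_warn_ratelimited("netpoll UDP send failed: %d\n", err);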
*/ -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; @@ -590,11 +594,11 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } -EXPORT_SYMBOL(page_pool_alloc_netmem); +EXPORT_SYMBOL(page_pool_alloc_netmems); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { - return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); @@ -839,69 +843,104 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_unrefed_page); -/** - * page_pool_put_page_bulk() - release references on multiple pages - * @pool: pool from which pages were allocated - * @data: array holding page pointers - * @count: number of pages in @data - * - * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring - * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() - * will release leftover pages to the page allocator. - * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx - * completion loop for the XDP_REDIRECT use case. - * - * Please note the caller must not use data area after running - * page_pool_put_page_bulk(), as this function overwrites it. - */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) { - int i, bulk_len = 0; - bool allow_direct; bool in_softirq; + u32 i; - allow_direct = page_pool_napi_local(pool); - - for (i = 0; i < count; i++) { - netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i])); - - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(netmem)) - continue; - - netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); - /* Approved for bulk recycling in ptr_ring cache */ - if (netmem) - data[bulk_len++] = (__force void *)netmem; - } - - if (!bulk_len) - return; - - /* Bulk producer into ptr_ring page_pool cache */ + /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); + for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } - recycle_stat_add(pool, ring, i); + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); - /* Hopefully all pages was return into ptr_ring */ + /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, (__force netmem_ref)data[i]); + page_pool_return_page(pool, bulk[i]); +} + +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. 
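A hedged sketch of the intended call site, a driver's XDP_REDIRECT Tx completion path; ``foo_get_completed_netmem()`` is hypothetical:

.. code-block:: c

    netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
    u32 n = 0;

    while (n < XDP_BULK_QUEUE_SIZE) {
        netmem_ref nm = foo_get_completed_netmem(); /* hypothetical */

        if (!nm)
            break;
        bulk[n++] = nm;
    }
    if (n)
        page_pool_put_netmem_bulk(bulk, n);

Note the pool argument of the old ``page_pool_put_page_bulk()`` is gone: the new function groups netmems by their owning page_pool internally, so arrays spanning multiple pools are handled.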
If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. + */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } } -EXPORT_SYMBOL(page_pool_put_page_bulk); +EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) @@ -957,7 +996,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, } if (!netmem) { - netmem = page_pool_alloc_netmem(pool, gfp); + netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e23cacbe66e..ee95dbb0539a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3883,17 +3883,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d9f959c619d9..6b745096809d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4765,15 +4765,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -4912,18 +4913,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net_device *dev; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; + const struct 
net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; int fidx = 0; + int err; + + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, @@ -4942,70 +4941,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - - if (brport_idx && (dev->ifindex != brport_idx)) - continue; - - if (!br_idx) { /* user did not specify a specific bridge */ - if (netif_is_bridge_port(dev)) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } - } else { - if (dev != br_dev && - !netif_is_bridge_port(dev)) - continue; + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !netif_is_bridge_master(dev)) - continue; - cops = ops; + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; - if (idx < s_idx) - goto cont; + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } - if (netif_is_bridge_port(dev)) { - if (cops && cops->ndo_fdb_dump) { - err = cops->ndo_fdb_dump(skb, cb, - br_dev, dev, - &fidx); - if (err == -EMSGSIZE) - goto out; - } + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; } + } - if (dev->netdev_ops->ndo_fdb_dump) - err = dev->netdev_ops->ndo_fdb_dump(skb, cb, - dev, NULL, - &fidx); - else - err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, - &fidx); - if (err == -EMSGSIZE) - goto out; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; - cops = NULL; + cops = NULL; - /* reset fdb offset to 0 for rest of the interfaces */ - cb->args[2] = 0; - fidx = 0; -cont: - idx++; - } + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } -out: - cb->args[0] = h; - cb->args[1] = idx; - cb->args[2] = fidx; + ctx->fdb_idx = fidx; return skb->len; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6841e61a6bd0..a441613a1e6c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1009,7 +1009,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog) + const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..e7bcc8952248 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -454,6 +454,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return 0; } +static bool sk_set_prio_allowed(const struct sock *sk, int val) +{ + return ((val >= 
TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); +} + static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -1193,9 +1200,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname, /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (sk_set_prio_allowed(sk, val)) { sock_set_priority(sk, val); return 0; } @@ -1514,6 +1519,10 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); + break; + case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1942,6 +1951,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sock_flag(sk, SOCK_RCVMARK); break; + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -2942,6 +2955,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 3717fb152ecc..a50a7ef49ae8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -9,6 +9,7 @@ #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <linux/export.h> +#include <linux/ptp_clock_kernel.h> static unsigned int classify(const struct sk_buff *skb) { @@ -21,19 +22,39 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; struct sk_buff *clone; unsigned int type; - if (!skb->sk || !skb->dev || - !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->sk || !skb->dev) return; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + type = classify(skb); if (type == PTP_CLASS_NONE) return; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) @@ -45,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; unsigned int type; - if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->dev) return false; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if 
(skb_headroom(skb) < ETH_HLEN) return false; @@ -63,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->rxtstamp)) return mii_ts->rxtstamp(mii_ts, skb, type); diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424..f1165a35411b 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -358,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; @@ -365,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. 
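Putting the three helpers just added together, a hedged driver-setup sketch; error paths are abbreviated and ``rxq`` is assumed already registered via ``xdp_rxq_info_reg()``:

.. code-block:: c

    struct page_pool *pp;
    int err;

    pp = page_pool_create(&pp_params);
    if (IS_ERR(pp))
        return PTR_ERR(pp);

    err = xdp_reg_page_pool(pp);
    if (err) {
        page_pool_destroy(pp);
        return err;
    }

    xdp_rxq_info_attach_page_pool(&rxq, pp);

This mirrors what net_page_pool_create() does for the per-CPU system pools earlier in this series.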
*/ -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp) { - struct page *page; - - switch (mem->type) { + switch (mem_type) { case MEM_TYPE_PAGE_POOL: - page = virt_to_head_page(data); + netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) * as mem->type knows this a page_pool page */ - page_pool_put_full_page(page->pp, page, napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, + napi_direct); break; case MEM_TYPE_PAGE_SHARED: - page_frag_free(data); + page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: - page = virt_to_page(data); /* Assumes order0 page*/ - put_page(page); + put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ @@ -399,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ - WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } @@ -407,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + false, NULL); - __xdp_return(page_address(page), &xdpf->mem, false, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + true, NULL); - __xdp_return(page_address(page), &xdpf->mem, true, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -452,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. 
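A hedged sketch of the stack-allocated bulk pattern this comment describes; ``for_each_completed_frame()`` stands in for a driver's own completion loop:

.. code-block:: c

    struct xdp_frame_bulk bq;

    xdp_frame_bulk_init(&bq);

    rcu_read_lock();    /* required around xdp_return_frame_bulk() */
    for_each_completed_frame(xdpf)      /* hypothetical iterator */
        xdp_return_frame_bulk(xdpf, &bq);
    xdp_flush_frame_bulk(&bq);
    rcu_read_unlock();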
*/ -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) -{ - struct xdp_mem_allocator *xa = bq->xa; - - if (unlikely(!xa || !bq->count)) - return; - - page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); - /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ - bq->count = 0; -} -EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - struct xdp_mem_info *mem = &xdpf->mem; - struct xdp_mem_allocator *xa; - - if (mem->type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } - xa = bq->xa; - if (unlikely(!xa)) { - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - bq->count = 0; - bq->xa = xa; - } - if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); - if (unlikely(mem->id != xa->mem.id)) { - xdp_flush_frame_bulk(bq); - bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - } - if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; @@ -500,31 +526,29 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; - bq->q[bq->count++] = skb_frag_address(frag); + bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } - bq->q[bq->count++] = xdpf->data; + bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), + xdp->rxq->mem.type, true, xdp); - __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); - } out: - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); @@ -570,7 +594,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; - xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; @@ -640,7 +664,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ @@ -687,8 +711,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; - nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - nxdpf->mem.id = 0; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } diff --git a/net/devlink/health.c b/net/devlink/health.c index b8d3084e6fe0..57db6799722a 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -1238,3 +1238,70 @@ int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, return reporter->ops->test(reporter, info->extack); } + +/** + * devlink_fmsg_dump_skb - Dump sk_buffer structure + * @fmsg: devlink formatted message pointer + * @skb: pointer to skb + * + * Dump diagnostic information about sk_buff structure, like headroom, length, + * tailroom, MAC, 
etc. + */ +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + struct sock *sk = skb->sk; + bool has_mac, has_trans; + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + devlink_fmsg_pair_nest_start(fmsg, "skb"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "actual len", skb->len); + devlink_fmsg_put(fmsg, "head len", skb_headlen(skb)); + devlink_fmsg_put(fmsg, "data len", skb->data_len); + devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb)); + devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1); + devlink_fmsg_put(fmsg, "MAC len", + has_mac ? skb_mac_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "network hdr", skb->network_header); + devlink_fmsg_put(fmsg, "network hdr len", + has_trans ? skb_network_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "transport hdr", + has_trans ? skb->transport_header : -1); + devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum); + devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed); + devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw); + devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid); + devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level); + devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash); + devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash); + devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol)); + devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type); + devlink_fmsg_put(fmsg, "iif", skb->skb_iif); + + if (sk) { + devlink_fmsg_pair_nest_start(fmsg, "sk"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "family", sk->sk_type); + devlink_fmsg_put(fmsg, "type", sk->sk_type); + devlink_fmsg_put(fmsg, "proto", sk->sk_protocol); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + } + + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + + devlink_fmsg_pair_nest_start(fmsg, "shinfo"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags); + devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags); + devlink_fmsg_put(fmsg, "gso_size", sh->gso_size); + devlink_fmsg_put(fmsg, "gso_type", sh->gso_type); + devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb); diff --git a/net/dsa/port.c b/net/dsa/port.c index ee0aaec4c8e0..5c9d1798e830 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1575,6 +1575,22 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } +/* dsa_supports_eee - indicate that EEE is supported + * @ds: pointer to &struct dsa_switch + * @port: port index + * + * A default implementation for the .support_eee() DSA operations member, + * which drivers can use to indicate that they support EEE on all of their + * user ports. 
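For hardware with no EEE-specific constraints, wiring the helper up is a single ops-table line. A hedged sketch, where the other ops and their implementations are hypothetical:

.. code-block:: c

    static const struct dsa_switch_ops foo_switch_ops = {
        .get_tag_protocol = foo_get_tag_protocol,
        .support_eee      = dsa_supports_eee,
        .set_mac_eee      = foo_set_mac_eee,
        .get_mac_eee      = foo_get_mac_eee,
    };

The dsa_user_set_eee()/dsa_user_get_eee() hunks below return -EOPNOTSUPP unless ``.support_eee()`` is implemented and returns true for the port.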
diff --git a/net/dsa/port.c b/net/dsa/port.c index ee0aaec4c8e0..5c9d1798e830 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1575,6 +1575,22 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } +/** + * dsa_supports_eee - indicate that EEE is supported + * @ds: pointer to &struct dsa_switch + * @port: port index + * + * A default implementation for the .support_eee() DSA operations member, + * which drivers can use to indicate that they support EEE on all of their + * user ports. + * + * Returns: true + */ +bool dsa_supports_eee(struct dsa_switch *ds, int port) +{ + return true; +} +EXPORT_SYMBOL_GPL(dsa_supports_eee); + static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) diff --git a/net/dsa/user.c b/net/dsa/user.c index 06c30a9e29ff..4a8de48a6f24 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -515,12 +515,13 @@ dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, @@ -1228,8 +1229,12 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) struct dsa_switch *ds = dp->ds; int ret; + /* Check whether the switch supports EEE */ + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) + return -EOPNOTSUPP; + /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -1248,8 +1253,12 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) struct dsa_switch *ds = dp->ds; int ret; + /* Check whether the switch supports EEE */ + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) + return -EOPNOTSUPP; + /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee)
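Since both ethtool EEE paths above now return -EOPNOTSUPP unless .support_eee() is implemented, a driver that supports EEE on every user port can simply plug in the new default helper. A sketch where the my_* ops are placeholders for the driver's existing implementations:

.. code-block:: c

    static const struct dsa_switch_ops my_switch_ops = {
            .get_tag_protocol = my_get_tag_protocol,
            .setup            = my_setup,
            .support_eee      = dsa_supports_eee,   /* opt in to EEE */
            .set_mac_eee      = my_set_mac_eee,
            .get_mac_eee      = my_get_mac_eee,
    };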
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 9b540644ba31..a1490c4afe6b 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -9,4 +9,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ - phy.o + phy.o tsconfig.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 05ce4f8080b3..02f941f667dd 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -5,9 +5,12 @@ #include <linux/phy.h> #include <linux/rtnetlink.h> #include <linux/ptp_clock_kernel.h> +#include <linux/phy_link_topology.h> #include "netlink.h" #include "common.h" +#include "../core/dev.h" + const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_SG_BIT] = "tx-scatter-gather", @@ -763,20 +766,98 @@ int ethtool_check_ops(const struct ethtool_ops *ops) return 0; } -int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) +static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) { - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - int err = 0; - memset(info, 0, sizeof(*info)); info->cmd = ETHTOOL_GET_TS_INFO; info->phc_index = -1; +} + +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int err; + + if (!ops->get_ts_info) + return -ENODEV; + + /* Does the PTP clock come from the netdev? */ + ethtool_init_tsinfo(info); + info->phc_qualifier = hwprov_desc->qualifier; + err = ops->get_ts_info(dev, info); + if (err) + return err; + + if (info->phc_index == hwprov_desc->index && + net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier)) + return 0; + + return -ENODEV; +} + +struct phy_device * +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; + + /* Only the precise qualifier is supported in phydev */ + if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return ERR_PTR(-ENODEV); + + /* Look in the phy topology */ + if (dev->link_topo) { + struct phy_device_node *pdn; + unsigned long phy_index; + + xa_for_each(&dev->link_topo->phys, phy_index, pdn) { + if (!phy_has_tsinfo(pdn->phy)) + continue; + + ethtool_init_tsinfo(info); + err = phy_ts_info(pdn->phy, info); + if (err) + return ERR_PTR(err); + + if (info->phc_index == hwprov_desc->index) + return pdn->phy; + } + return ERR_PTR(-ENODEV); + } + + /* Look at dev->phydev */ + if (phy_has_tsinfo(dev->phydev)) { + ethtool_init_tsinfo(info); + err = phy_ts_info(dev->phydev, info); + if (err) + return ERR_PTR(err); - if (phy_is_default_hwtstamp(phydev) && phy_has_tsinfo(phydev)) - err = phy_ts_info(phydev, info); - else if (ops->get_ts_info) - err = ops->get_ts_info(dev, info); + if (info->phc_index == hwprov_desc->index) + return dev->phydev; + } + + return ERR_PTR(-ENODEV); +} + +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; + + err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); + if (err == -ENODEV) { + struct phy_device *phy; + + phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); + if (IS_ERR(phy)) + err = PTR_ERR(phy); + else + err = 0; + } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; @@ -784,6 +865,55 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info return err; } +int __ethtool_get_ts_info(struct net_device *dev, + struct kernel_ethtool_ts_info *info) +{ + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + /* No provider specified, use default behavior */ + if (!hwprov) { + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + int err = 0; + + ethtool_init_tsinfo(info); + if (phy_is_default_hwtstamp(phydev) && + phy_has_tsinfo(phydev)) + err = phy_ts_info(phydev, info); + else if (ops->get_ts_info) + err = ops->get_ts_info(dev, info); + + info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + return err; + } + + return ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); +} + +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return false; + + /* Return true for the precise qualifier, and for NICs without a + * qualifier description, so as not to break the old behavior.
+ */ + if (!ops->supported_hwtstamp_qualifiers && + qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return true; + + if (ops->supported_hwtstamp_qualifiers & BIT(qualifier)) + return true; + + return false; +} + int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) { struct kernel_ethtool_ts_info info = { }; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 4a2de3ce7354..850eadde4bfc 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -21,6 +21,7 @@ struct link_mode_info { }; struct genl_info; +struct hwtstamp_provider_desc; extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; @@ -49,6 +50,18 @@ int ethtool_check_max_channel(struct net_device *dev, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +struct phy_device * +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier); extern const struct ethtool_phy_ops *ethtool_phy_ops; extern const struct ethtool_pse_ops *ethtool_pse_ops; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e3f0ef6b851b..849c98e637c6 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -394,6 +394,8 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = ðnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, + [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, + [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1074,9 +1076,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_tsinfo_start, + .dumpit = ethnl_tsinfo_dumpit, + .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, @@ -1243,6 +1245,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_tsconfig_get_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_tsconfig_set_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 203b08eb6c6f..0a09298fff92 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -435,6 +435,7 @@ extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops 
ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; +extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -464,7 +465,7 @@ extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; -extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1]; +extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; @@ -485,6 +486,8 @@ extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); @@ -499,6 +502,9 @@ int ethnl_phy_start(struct netlink_callback *cb); int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_done(struct netlink_callback *cb); +int ethnl_tsinfo_start(struct netlink_callback *cb); +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_tsinfo_done(struct netlink_callback *cb); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h new file mode 100644 index 000000000000..d901a879a671 --- /dev/null +++ b/net/ethtool/ts.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_TS_H +#define _NET_ETHTOOL_TS_H + +#include "netlink.h" + +static const struct nla_policy +ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = { + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] = + NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1) +}; + +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + bool *mod); + +#endif /* _NET_ETHTOOL_TS_H */ diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c new file mode 100644 index 000000000000..9188e088fb2f --- /dev/null +++ b/net/ethtool/tsconfig.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net_tstamp.h> +#include <linux/ptp_clock_kernel.h> + +#include "netlink.h" +#include "common.h" +#include 
"bitset.h" +#include "../core/dev.h" +#include "ts.h" + +struct tsconfig_req_info { + struct ethnl_req_info base; +}; + +struct tsconfig_reply_data { + struct ethnl_reply_data base; + struct hwtstamp_provider_desc hwprov_desc; + struct { + u32 tx_type; + u32 rx_filter; + u32 flags; + } hwtst_config; +}; + +#define TSCONFIG_REPDATA(__reply_base) \ + container_of(__reply_base, struct tsconfig_reply_data, base) + +const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = reply_base->dev; + struct kernel_hwtstamp_config cfg = {}; + int ret; + + if (!dev->netdev_ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = dev_get_hwtstamp_phylib(dev, &cfg); + if (ret) + goto out; + + data->hwtst_config.tx_type = BIT(cfg.tx_type); + data->hwtst_config.rx_filter = BIT(cfg.rx_filter); + data->hwtst_config.flags = BIT(cfg.flags); + + data->hwprov_desc.index = -1; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + data->hwprov_desc.index = hwprov->desc.index; + data->hwprov_desc.qualifier = hwprov->desc.qualifier; + } else { + struct kernel_ethtool_ts_info ts_info = {}; + + ts_info.phc_index = -1; + ret = __ethtool_get_ts_info(dev, &ts_info); + if (ret) + goto out; + + if (ts_info.phc_index == -1) + return -ENODEV; + + data->hwprov_desc.index = ts_info.phc_index; + data->hwprov_desc.qualifier = ts_info.phc_qualifier; + } + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int tsconfig_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len = 0; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + + if (data->hwtst_config.flags) + /* _TSCONFIG_HWTSTAMP_FLAGS */ + len += nla_total_size(sizeof(u32)); + + if (data->hwtst_config.tx_type) { + ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, + NULL, __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_TX_TYPES */ + } + if (data->hwtst_config.rx_filter) { + ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_RX_FILTERS */ + } + + if (data->hwprov_desc.index >= 0) + /* _TSCONFIG_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + + 2 * nla_total_size(sizeof(u32)); + + return len; +} + +static int tsconfig_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (data->hwtst_config.flags) { + ret = nla_put_u32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + data->hwtst_config.flags); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.tx_type) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES, + &data->hwtst_config.tx_type, NULL, + __HWTSTAMP_TX_CNT, + 
ts_tx_type_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.rx_filter) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS, + &data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwprov_desc.index >= 0) { + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + data->hwprov_desc.index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + data->hwprov_desc.qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } + return 0; +} + +/* TSCONFIG_SET */ +const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_U32 }, + [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, + [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, +}; + +static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) +{ + struct tsconfig_reply_data *reply_data; + struct tsconfig_req_info *req_info; + struct sk_buff *rskb; + void *reply_payload; + int reply_len = 0; + int ret; + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ASSERT_RTNL(); + reply_data->base.dev = dev; + ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info); + if (ret < 0) + goto err_cleanup; + + ret = tsconfig_reply_size(&req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + reply_len = ret + ethnl_reply_header_size(); + rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, + ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); + if (!rskb) + goto err_cleanup; + + ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + genlmsg_end(rskb, reply_payload); + ret = genlmsg_reply(rskb, info); + +err_cleanup: + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + const struct net_device_ops *ops = req_base->dev->netdev_ops; + + if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + return 1; +} + +static struct hwtstamp_provider * +tsconfig_set_hwprov_from_desc(struct net_device *dev, + struct genl_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + struct kernel_ethtool_ts_info ts_info; + struct hwtstamp_provider *hwprov; + struct nlattr **tb = info->attrs; + struct phy_device *phy = NULL; + enum hwtstamp_source source; + int ret; + + ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (!ret) { + /* Found */ + source = HWTSTAMP_SOURCE_NETDEV; + } else { + phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (IS_ERR(phy)) { + if (PTR_ERR(phy) == -ENODEV) + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + "phc not in this net device topology"); + return ERR_CAST(phy); + } + + source = HWTSTAMP_SOURCE_PHYLIB; + } + + hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL); + if (!hwprov) + return ERR_PTR(-ENOMEM); + + 
hwprov->desc.index = hwprov_desc->index; + hwprov->desc.qualifier = hwprov_desc->qualifier; + hwprov->source = source; + hwprov->phydev = phy; + + return hwprov; +} + +static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + struct kernel_hwtstamp_config hwtst_config = {0}; + bool hwprov_mod = false, config_mod = false; + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = req_base->dev; + struct nlattr **tb = info->attrs; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + + if (!netif_device_present(dev)) + return -ENODEV; + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) { + struct hwtstamp_provider_desc __hwprov_desc = {.index = -1}; + struct hwtstamp_provider *__hwprov; + + __hwprov = rtnl_dereference(dev->hwprov); + if (__hwprov) { + __hwprov_desc.index = __hwprov->desc.index; + __hwprov_desc.qualifier = __hwprov->desc.qualifier; + } + + ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + &__hwprov_desc, info->extack, + &hwprov_mod); + if (ret < 0) + return ret; + + if (hwprov_mod) { + hwprov = tsconfig_set_hwprov_from_desc(dev, info, + &__hwprov_desc); + if (IS_ERR(hwprov)) + return PTR_ERR(hwprov); + } + } + + /* Get current hwtstamp config if we are not changing the + * hwtstamp source. It will be zeroed in the other case. + */ + if (!hwprov_mod) { + ret = dev_get_hwtstamp_phylib(dev, &hwtst_config); + if (ret < 0 && ret != -EOPNOTSUPP) + goto err_free_hwprov; + } + + /* Get the hwtstamp config from netlink */ + if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) { + u32 req_tx_type; + + req_tx_type = BIT(hwtst_config.tx_type); + ret = ethnl_update_bitset32(&req_tx_type, + __HWTSTAMP_TX_CNT, + tb[ETHTOOL_A_TSCONFIG_TX_TYPES], + ts_tx_type_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one tx type at a time */ + if (ffs(req_tx_type) != fls(req_tx_type)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.tx_type = ffs(req_tx_type) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) { + u32 req_rx_filter; + + req_rx_filter = BIT(hwtst_config.rx_filter); + ret = ethnl_update_bitset32(&req_rx_filter, + __HWTSTAMP_FILTER_CNT, + tb[ETHTOOL_A_TSCONFIG_RX_FILTERS], + ts_rx_filter_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one rx filter at a time */ + if (ffs(req_rx_filter) != fls(req_rx_filter)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.rx_filter = ffs(req_rx_filter) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { + ethnl_update_u32(&hwtst_config.flags, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + &config_mod); + } + + ret = net_hwtstamp_validate(&hwtst_config); + if (ret) + goto err_free_hwprov; + + if (hwprov_mod) { + struct kernel_hwtstamp_config zero_config = {0}; + struct hwtstamp_provider *__hwprov; + + /* Disable current time stamping if we try to enable + * another one + */ + ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack); + if (ret < 0) + goto err_free_hwprov; + + /* Change the selected hwtstamp source */ + __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov); + if (__hwprov) + kfree_rcu(__hwprov, rcu_head); + } + + if (config_mod) { + ret = dev_set_hwtstamp_phylib(dev, &hwtst_config, + info->extack); + if (ret < 0) + return ret; + } + + if (hwprov_mod || config_mod) { + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new 
configuration set"); + return ret; + } + } + + /* tsconfig has no notification */ + return 0; + +err_free_hwprov: + kfree(hwprov); + + return ret; +} + +const struct ethnl_request_ops ethnl_tsconfig_request_ops = { + .request_cmd = ETHTOOL_MSG_TSCONFIG_GET, + .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY, + .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER, + .req_info_size = sizeof(struct tsconfig_req_info), + .reply_data_size = sizeof(struct tsconfig_reply_data), + + .prepare_data = tsconfig_prepare_data, + .reply_size = tsconfig_reply_size, + .fill_reply = tsconfig_fill_reply, + + .set_validate = ethnl_set_tsconfig_validate, + .set = ethnl_set_tsconfig, +}; diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 03d12d6f79ca..7e495a41aeec 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -1,13 +1,18 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> +#include <linux/phy.h> +#include <linux/phy_link_topology.h> +#include <linux/ptp_clock_kernel.h> #include "netlink.h" #include "common.h" #include "bitset.h" +#include "ts.h" struct tsinfo_req_info { struct ethnl_req_info base; + struct hwtstamp_provider_desc hwprov_desc; }; struct tsinfo_reply_data { @@ -16,34 +21,96 @@ struct tsinfo_reply_data { struct ethtool_ts_stats stats; }; +#define TSINFO_REQINFO(__req_base) \ + container_of(__req_base, struct tsinfo_req_info, base) + #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) -const struct nla_policy ethnl_tsinfo_get_policy[] = { +const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), + [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), }; +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + bool *mod) +{ + struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)]; + int ret; + + ret = nla_parse_nested(tb, + ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, + nest, + ethnl_ts_hwtst_prov_policy, extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) || + NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER)) + return -EINVAL; + + ethnl_update_u32(&hwprov_desc->index, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX], + mod); + ethnl_update_u32(&hwprov_desc->qualifier, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER], + mod); + + return 0; +} + +static int +tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); + bool mod = false; + + req->hwprov_desc.index = -1; + + if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) + return 0; + + return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], + &req->hwprov_desc, extack, &mod); +} + static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + if (req->hwprov_desc.index != -1) { + ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info, + &req->hwprov_desc); + 
ethnl_ops_complete(dev); + return ret; + } + if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } + ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); @@ -87,8 +154,11 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, return ret; len += ret; /* _TSINFO_RX_FILTERS */ } - if (ts_info->phc_index >= 0) + if (ts_info->phc_index >= 0) { len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + /* _TSINFO_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); + } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; @@ -163,9 +233,29 @@ static int tsinfo_fill_reply(struct sk_buff *skb, if (ret < 0) return ret; } - if (ts_info->phc_index >= 0 && - nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) - return -EMSGSIZE; + if (ts_info->phc_index >= 0) { + struct nlattr *nest; + + ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, + ts_info->phc_index); + if (ret) + return -EMSGSIZE; + + nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ts_info->phc_index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + ts_info->phc_qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; @@ -173,6 +263,263 @@ static int tsinfo_fill_reply(struct sk_buff *skb, return 0; } +struct ethnl_tsinfo_dump_ctx { + struct tsinfo_req_info *req_info; + struct tsinfo_reply_data *reply_data; + unsigned long pos_ifindex; + bool netdev_dump_done; + unsigned long pos_phyindex; + enum hwtstamp_provider_qualifier pos_phcqualifier; +}; + +static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_reply_data *reply_data, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + void *ehdr = NULL; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TSINFO_GET_REPLY); + if (!ehdr) + return ERR_PTR(-EMSGSIZE); + + reply_data = ctx->reply_data; + memset(reply_data, 0, sizeof(*reply_data)); + reply_data->base.dev = dev; + memset(&reply_data->ts_info, 0, sizeof(reply_data->ts_info)); + + return ehdr; +} + +static int ethnl_tsinfo_end_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_req_info *req_info, + struct tsinfo_reply_data *reply_data, + void *ehdr) +{ + int ret; + + reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER); + if (ret < 0) + return ret; + + ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base); + if (ret < 0) + return ret; + + reply_data->base.dev = NULL; + genlmsg_end(skb, ehdr); + + return ret; +} + +static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, + struct net_device *dev, + struct phy_device *phydev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!phy_has_tsinfo(phydev)) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = 
ctx->req_info; + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) + return PTR_ERR(ehdr); + + ret = phy_ts_info(phydev, &reply_data->ts_info); + if (ret < 0) + goto err; + + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); + if (ret < 0) + goto err; + + return ret; +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!ops->get_ts_info) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = ctx->req_info; + for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT; + ctx->pos_phcqualifier++) { + if (!net_support_hwtstamp_qualifier(dev, + ctx->pos_phcqualifier)) + continue; + + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) { + ret = PTR_ERR(ehdr); + goto err; + } + + reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; + ret = ops->get_ts_info(dev, &reply_data->ts_info); + if (ret < 0) + goto err; + + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, + ehdr); + if (ret < 0) + goto err; + } + + return ret; + +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct phy_device_node *pdn; + int ret = 0; + + if (!ctx->netdev_dump_done) { + ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + ctx->netdev_dump_done = true; + } + + if (!dev->link_topo) { + if (phy_has_tsinfo(dev->phydev)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + dev->phydev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + + return 0; + } + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + if (phy_has_tsinfo(pdn->phy)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + pdn->phy, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + } + + return ret; +} + +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + if (ctx->req_info->base.dev) { + ret = ethnl_tsinfo_dump_one_net_topo(skb, + ctx->req_info->base.dev, + cb); + } else { + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + break; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + } + } + rtnl_unlock(); + + return ret; +} + +int ethnl_tsinfo_start(struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct nlattr **tb = info->info.attrs; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + ret 
= ethnl_parse_header_dev_get(&req_info->base, + tb[ETHTOOL_A_TSINFO_HEADER], + sock_net(cb->skb->sk), cb->extack, + false); + if (ret < 0) + goto free_reply_data; + + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +int ethnl_tsinfo_done(struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_req_info *req_info = ctx->req_info; + + ethnl_parse_header_dev_put(&req_info->base); + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, @@ -180,6 +527,7 @@ const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), + .parse_request = tsinfo_parse_request, .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8325224ef072..9517b8667e00 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -249,6 +249,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; + if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { + NL_SET_ERR_MSG(extack, + "Flow label cannot be specified for IPv4 FIB rules"); + goto errout; + } + if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 161f5526b86c..d6411ac81096 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2999,7 +2999,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? 
nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, @@ -3011,7 +3011,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 963a89ae9c26..094084b61bff 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; - int vif; if (!apply_ratelimit) return true; @@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - vif = l3mdev_master_ifindex(dst->dev); - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rcu_read_lock(); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, + l3mdev_master_ifindex_rcu(dst->dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6a238398acc9..8a370ef37d3f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -88,6 +88,8 @@ #include <linux/byteorder/generic.h> #include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> @@ -1430,6 +1432,63 @@ static void ip_mc_hash_remove(struct in_device *in_dev, *mc_hash = im->next_hash; } +static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, int event) +{ + struct ifa_cacheinfo ci; + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0); + if (!nlh) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = 32; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + ifm->ifa_index = dev->ifindex; + + ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; + ci.tstamp = ci.cstamp; + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + + if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || + nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static void inet_ifmcaddr_notify(struct net_device *dev, + const struct ip_mc_list *im, int event) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(__be32)), GFP_ATOMIC); + if (!skb) + goto error; + + err = inet_fill_ifmcaddr(skb, dev, im, event); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_ATOMIC); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); +} /* * A socket has joined a multicast group on device dev. 
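The inet_ifmcaddr_notify() helper above (hooked into IPv4 group add/remove just below) broadcasts RTM_NEWMULTICAST and RTM_DELMULTICAST messages to the new RTNLGRP_IPV4_MCADDR group. A hedged userspace sketch of a listener; it assumes uapi headers that already define RTNLGRP_IPV4_MCADDR, and the SOL_NETLINK fallback define is only for older toolchains:

.. code-block:: c

    #include <stdio.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>

    #ifndef SOL_NETLINK
    #define SOL_NETLINK 270
    #endif

    int main(void)
    {
            struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
            unsigned int grp = RTNLGRP_IPV4_MCADDR;
            char buf[8192];
            ssize_t len;
            int fd;

            fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
            if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                    return 1;

            /* setsockopt() membership works for group numbers above 31 */
            if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
                           &grp, sizeof(grp)) < 0)
                    return 1;

            while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
                    struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

                    for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len))
                            if (nlh->nlmsg_type == RTM_NEWMULTICAST ||
                                nlh->nlmsg_type == RTM_DELMULTICAST)
                                    printf("IPv4 mcast address %s\n",
                                           nlh->nlmsg_type == RTM_NEWMULTICAST ?
                                           "added" : "removed");
            }
            return 0;
    }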
@@ -1473,6 +1532,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; + im->mca_cstamp = jiffies; + im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; @@ -1492,6 +1553,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); + inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: @@ -1705,6 +1767,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); + inet_ifmcaddr_notify(in_dev->dev, i, + RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 5ab56f4cb529..e02484f4d22b 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, { struct rb_node **pp, *parent, *next; struct inet_peer *p; + u32 now; pp = &base->rb_root.rb_node; parent = NULL; @@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - if (!refcount_inc_not_zero(&p->refcnt)) - break; + now = jiffies; + if (READ_ONCE(p->dtime) != now) + WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { @@ -150,9 +152,6 @@ static void inet_peer_gc(struct inet_peer_base *base, for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - /* The READ_ONCE() pairs with the WRITE_ONCE() - * in inet_putpeer() - */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) @@ -168,31 +167,23 @@ static void inet_peer_gc(struct inet_peer_base *base, } } +/* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create) + const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; - int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - invalidated = read_seqretry(&base->lock, seq); - rcu_read_unlock(); if (p) return p; - /* If no writer did a change during our lookup, we can return early. */ - if (!create && !invalidated) - return NULL; - /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. 
*/ @@ -201,12 +192,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); - if (!p && create) { + if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; - refcount_set(&p->refcnt, 2); + refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; @@ -231,15 +222,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - /* The WRITE_ONCE() pairs with itself (we run lockless) - * and the READ_ONCE() in inet_peer_gc() - */ - WRITE_ONCE(p->dtime, (__u32)jiffies); - if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } -EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 07036a2943c1..7a435746a22d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct net *net = q->fqdir->net; - const struct frag_v4_compare_key *key = a; + struct net *net = q->fqdir->net; + struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->fqdir->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : - NULL; + if (q->fqdir->max_dist) { + rcu_read_lock(); + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); + if (p && !refcount_inc_not_zero(&p->refcnt)) + p = NULL; + rcu_read_unlock(); + } + qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0065b1996c94..f45a083f2c13 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1169,7 +1169,10 @@ alloc_new_skb: /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. */ - if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1213,8 +1216,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1252,7 +1256,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1328,7 +1333,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; - cork->priority = ipc->priority; + cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); @@ -1465,7 +1470,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt); } - skb->priority = (cork->tos != -1) ? 
cork->priority: READ_ONCE(sk->sk_priority); + skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cf377377b52d..f6a03b418dde 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; - ipc->priority = rt_tos2priority(ipc->tos); + ipc->sockc.priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0e9e01967ec9..4304a68d1db0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0fbec3509618..9f9d4e6ea1b9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -870,11 +870,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); - rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { + rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; @@ -893,7 +893,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; - goto out_put_peer; + goto out_unlock; } /* Check for load limit; set rate_last to the latest sent @@ -914,8 +914,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } -out_put_peer: - inet_putpeer(peer); +out_unlock: + rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) @@ -975,9 +975,9 @@ static int ip_error(struct sk_buff *skb) break; } + rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev), 1); - + l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; @@ -989,8 +989,9 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; - inet_putpeer(peer); } + rcu_read_unlock(); + if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a79b2a52ce01..42cb5dc9cb24 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; +static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1066,6 +1067,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = SYSCTL_TWO, }, { + .procname = "tcp_tw_reuse_delay", + .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &tcp_tw_reuse_delay_max, + }, + { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44db..e45222d5fc2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; + u32 reuse_thresh; if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) reuse = 0; @@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); if (ts_recent_stamp && - (!twp || (reuse && time_after32(ktime_get_seconds(), - ts_recent_stamp)))) { + (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ @@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7121d8573928..b089b08e9617 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, rcv_nxt); if (tmp_opt.saw_tstamp) { + u64 ts = tcp_clock_ms(); + + WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, - ktime_get_seconds()); + div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } @@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + /* refreshed when we enter true TIME-WAIT state */ + tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0e765466d7f7..2e2684886953 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5127,22 +5127,6 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - -struct inet6_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; - enum addr_type_t type; -}; - static int inet6_fill_ifaddr(struct sk_buff *skb, const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) @@ -5221,15 +5205,16 @@ error: return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, - const struct ifmcaddr6 *ifmca, - struct inet6_fill_args *args) +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args) { int ifindex = ifmca->idev->dev->ifindex; u8 scope = RT_SCOPE_UNIVERSE; struct nlmsghdr *nlh; - if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + if (!args->force_rt_scope_universe 
&& + ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, @@ -5254,6 +5239,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, nlmsg_end(skb, nlh); return 0; } +EXPORT_SYMBOL(inet6_fill_ifmcaddr); static int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, @@ -5418,6 +5404,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .flags = NLM_F_MULTI, .netnsid = -1, .type = type, + .force_rt_scope_universe = false, }; struct { unsigned long ifindex; @@ -5546,6 +5533,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .event = RTM_NEWADDR, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; @@ -5617,6 +5605,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) .event = event, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; int err = -ENOBUFS; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index c85c1627cb16..67d39114d9a6 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -26,6 +26,8 @@ struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; + __be32 flowlabel; + __be32 flowlabel_mask; dscp_t dscp; u8 dscp_full:1; /* DSCP or TOS selector */ }; @@ -34,7 +36,7 @@ static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); - if (r->dst.plen || r->src.plen || r->dscp) + if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } @@ -332,6 +334,9 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) return 0; + if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) + return 0; + if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; @@ -360,6 +365,35 @@ static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, return 0; } +static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + __be32 flowlabel, flowlabel_mask; + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || + NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) + return -EINVAL; + + flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); + flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); + + if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], + "Invalid flow label mask"); + return -EINVAL; + } + + if (flowlabel & ~flowlabel_mask) { + NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); + return -EINVAL; + } + + rule6->flowlabel = flowlabel; + rule6->flowlabel_mask = flowlabel_mask; + + return 0; +} + static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, @@ -379,6 +413,10 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && + fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) + goto errout; + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); @@ -444,6 +482,14 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_FLOWLABEL] 
&& + nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) + return 0; + + if (tb[FRA_FLOWLABEL_MASK] && + nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) + return 0; + if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; @@ -472,6 +518,11 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->tos = inet_dscp_to_dsfield(rule6->dscp); } + if (rule6->flowlabel_mask && + (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || + nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) + goto nla_put_failure; + if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && @@ -487,7 +538,9 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ - + nla_total_size(1); /* dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(4) /* flowlabel */ + + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 071b0bc1179d..a6984a29fdb9 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -222,10 +222,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 9d8422e350f8..28e5a89dc255 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -253,14 +253,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) } static int ioam6_do_inline(struct net *net, struct sk_buff *skb, - struct ioam6_lwt_encap *tuninfo) + struct ioam6_lwt_encap *tuninfo, + struct dst_entry *cache_dst) { struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -291,7 +292,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, bool has_tunsrc, struct in6_addr *tunsrc, - struct in6_addr *tundst) + struct in6_addr *tundst, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr, *inner_hdr; @@ -300,7 +302,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdrlen = (tuninfo->eh.hdrlen + 1) << 3; len = sizeof(*hdr) + hdrlen; - err = skb_cow_head(skb, len + skb->mac_len); + err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -334,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); + struct dst_entry *dst = skb_dst(skb), *cache_dst; struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; @@ -352,6 +354,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) orig_daddr = ipv6_hdr(skb)->daddr; + local_bh_disable(); + cache_dst = dst_cache_get(&ilwt->cache); + local_bh_enable(); + switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: do_inline: @@ -359,7 +365,7 @@ do_inline: if 
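/*
 * [Editor's note, illustration only, not part of this diff] The
 * icmpv6_xrlim_allow() hunk above is part of a series-wide conversion
 * of inetpeer lookups from reference counting (inet_getpeer_v6(..., 1)
 * plus inet_putpeer()) to plain RCU protection. The generic shape of
 * that pattern, with hypothetical names:
 *
 *	rcu_read_lock();
 *	peer = lookup_rcu(table, key);		// may be NULL
 *	if (peer)
 *		rate_ok = check_rate(peer);	// must not sleep or stash
 *						// the pointer past unlock
 *	rcu_read_unlock();
 */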
(ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); if (unlikely(err)) goto drop; @@ -369,7 +375,7 @@ do_encap: /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst); + &ilwt->tundst, cache_dst); if (unlikely(err)) goto drop; @@ -387,41 +393,36 @@ do_encap: goto drop; } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; + if (unlikely(!cache_dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + cache_dst = ip6_route_output(net, NULL, &fl6); + if (cache_dst->error) { + err = cache_dst->error; + dst_release(cache_dst); + goto drop; + } - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { local_bh_disable(); - dst = dst_cache_get(&ilwt->cache); + dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); local_bh_enable(); - if (unlikely(!dst)) { - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = hdr->daddr; - fl6.saddr = hdr->saddr; - fl6.flowlabel = ip6_flowinfo(hdr); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = hdr->nexthdr; - - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - err = dst->error; - dst_release(dst); - goto drop; - } - - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - local_bh_enable(); - } + err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + if (unlikely(err)) + goto drop; + } + if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { skb_dst_drop(skb); - skb_dst_set(skb, dst); - + skb_dst_set(skb, cache_dst); return dst_output(net, sk, skb); } out: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7b4608bb316..d577bf2f3053 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -613,15 +613,15 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -1401,6 +1401,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; + cork->base.priority = ipc6->sockc.priority; sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; @@ -1697,8 +1698,9 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && - getfrag(from, data + transhdrlen, offset, - copy, fraggap, skb) < 0) { + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1742,8 +1744,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { 
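/*
 * [Editor's note, illustration only, not part of this diff] The
 * INDIRECT_CALL_1() wrappers added around getfrag() in ip6_append_data
 * above avoid a retpoline-penalized indirect call in the common case.
 * The macro expands to roughly:
 *
 *	ret = likely(getfrag == ip_generic_getfrag) ?
 *		ip_generic_getfrag(from, to, offset, len, odd, skb) :
 *		getfrag(from, to, offset, len, odd, skb);
 *
 * i.e. compare the pointer against the expected target and branch to a
 * direct call when it matches.
 */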
__skb_trim(skb, off); err = -EFAULT; goto error; @@ -1781,7 +1784,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1939,7 +1943,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = cork->base.priority; skb->mark = cork->base.mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b244dbf61d5f..587831c148de 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -33,8 +33,10 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> +#include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> +#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -47,6 +49,7 @@ #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> @@ -901,6 +904,39 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, return mc; } +static void inet6_ifmcaddr_notify(struct net_device *dev, + const struct ifmcaddr6 *ifmca, int event) +{ + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + .force_rt_scope_universe = true, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16), GFP_ATOMIC); + if (!skb) + goto error; + + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_ATOMIC); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); +} + /* * device multicast group inc (add if not found) */ @@ -948,6 +984,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mld_del_delrec(idev, mc); igmp6_group_added(mc); + inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; @@ -977,6 +1014,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) *map = ma->next; igmp6_group_dropped(ma); + inet6_ifmcaddr_notify(idev->dev, ma, + RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); @@ -1021,29 +1060,31 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); - if (idev) { - for_each_mc_rcu(idev, mc) { - if (ipv6_addr_equal(&mc->mca_addr, group)) - break; - } - if (mc) { - if (src_addr && !ipv6_addr_any(src_addr)) { - struct ip6_sf_list *psf; + if (!idev) + goto unlock; + for_each_mc_rcu(idev, mc) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (!mc) + goto unlock; + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; - for_each_psf_rcu(mc, psf) { - if (ipv6_addr_equal(&psf->sf_addr, src_addr)) - break; - } - if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; - else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - } else - rv = true; /* don't filter unspecified source */ + for_each_psf_rcu(mc, psf) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + 
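/*
 * [Editor's note, illustration only, not part of this diff] In the new
 * inet6_ifmcaddr_notify() above, the skb size is derived from the
 * netlink TLV layout rather than guessed: NLMSG_ALIGN(sizeof(struct
 * ifaddrmsg)) for the fixed header plus nla_total_size(16) for one
 * attribute carrying a 16-byte IPv6 address (attribute header plus
 * padded payload). The same computation as a standalone expression:
 *
 *	size_t payload = NLMSG_ALIGN(sizeof(struct ifaddrmsg))
 *		       + nla_total_size(sizeof(struct in6_addr));
 */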
break; } + if (psf) + rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || + READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != + READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); + else + rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; + } else { + rv = true; /* don't filter unspecified source */ } +unlock: rcu_read_unlock(); return rv; } @@ -2285,7 +2326,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* source filter not found, or count wrong => bug */ return -ESRCH; } - psf->sf_count[sfmode]--; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; @@ -2391,7 +2432,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, rcu_assign_pointer(pmc->mca_sources, psf); } } - psf->sf_count[sfmode]++; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } @@ -2503,7 +2544,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) - pmc->mca_sfcount[sfmode]++; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); @@ -2514,7 +2556,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int j; if (!delta) - pmc->mca_sfcount[sfmode]--; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { @@ -2559,7 +2602,8 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; - pmc->mca_sfcount[MCAST_EXCLUDE] = 1; + /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ + WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ @@ -3074,8 +3118,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, - psf->sf_count[MCAST_INCLUDE], - psf->sf_count[MCAST_EXCLUDE]); + READ_ONCE(psf->sf_count[MCAST_INCLUDE]), + READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index aba94a348673..d044c67019de 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1731,10 +1731,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); + + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); + if (!ret) goto release; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 88b3fcacd4f9..46b8adf6e7f8 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,6 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8476a3944a88..a45aba090aa4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -619,7 +619,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, 
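/*
 * [Editor's note, illustration only, not part of this diff] The
 * READ_ONCE()/WRITE_ONCE() churn above annotates an intentional data
 * race: sf_count[] and mca_sfcount[] are updated under mc_lock but are
 * now also read locklessly under RCU in ipv6_chk_mcast_addr(). The
 * pairing keeps the compiler from tearing or caching the accesses and
 * documents the race for KCSAN. Generic shape:
 *
 *	// writer, lock held
 *	WRITE_ONCE(obj->counter, obj->counter + 1);
 *
 *	// lockless reader, e.g. under rcu_read_lock()
 *	if (READ_ONCE(obj->counter))
 *		...;
 */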
skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); @@ -780,6 +780,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 67ff16c04718..78362822b907 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5005,6 +5005,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, + [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5030,6 +5031,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + if (tb[RTA_FLOWLABEL]) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Flow label cannot be specified for this operation"); + goto errout; + } + *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, @@ -6013,6 +6020,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } + if (tb[RTA_FLOWLABEL] && + (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Invalid flow label"); + return -EINVAL; + } + for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; @@ -6027,6 +6041,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: + case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); @@ -6049,6 +6064,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; + __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); @@ -6057,7 +6073,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { @@ -6103,6 +6118,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); + if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index db3c19a42e1c..7ba22d2f2bfe 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -125,7 +125,8 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, - const struct ipv6_rpl_sr_hdr *srh) + const struct ipv6_rpl_sr_hdr *srh, + struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; @@ -153,7 +154,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, hdrlen = ((csrh->hdrlen + 1) << 3); - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; @@ -186,7 +187,8 @@ static int 
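/*
 * [Editor's note, illustration only, not part of this diff] The new
 * RTA_FLOWLABEL handling above is typed NLA_BE32 in the policy and
 * then range-checked by hand, since IPV6_FLOWLABEL_MASK
 * (htonl(0x000FFFFF)) only covers the 20-bit flow label field. The
 * check is equivalent to:
 *
 *	if (ntohl(nla_get_be32(tb[RTA_FLOWLABEL])) & ~0x000FFFFFu)
 *		return -EINVAL;	// bits 20..31 must be clear
 *
 * and nla_get_be32_default(tb[RTA_FLOWLABEL], 0) later substitutes 0
 * when the attribute is absent.
 */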
rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, return 0; } -static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; @@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) tinfo = rpl_encap_lwtunnel(dst->lwtstate); - return rpl_do_srh_inline(skb, rlwt, tinfo->srh); + return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; - local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -237,15 +239,15 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - return dst_output(net, sk, skb); drop: @@ -262,29 +264,31 @@ static int rpl_input(struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; - local_bh_disable(); dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) + goto drop; if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; return dst_input(skb); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 098632adc9b5..4bf937bfc263 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -124,8 +124,8 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, return flowlabel; } -/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(dst->dev); @@ -137,7 +137,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -197,11 +197,18 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return 0; } + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int 
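/*
 * [Editor's note, illustration only, not part of this diff] The
 * skb_cow_head() calls in these lwtunnels now size their headroom via
 * dst_dev_overhead(cache_dst, skb), a helper introduced elsewhere in
 * this series. Reconstructed from how it is used here (a sketch, not
 * necessarily the exact definition): prefer the real device headroom
 * when the cached route is already known, otherwise fall back to
 * skb->mac_len as before.
 *
 *	static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
 *						    struct sk_buff *skb)
 *	{
 *		if (dst)
 *			return LL_RESERVED_SPACE(dst->dev);
 *		return skb->mac_len;
 *	}
 */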
proto) +{ + return __seg6_do_srh_encap(skb, osrh, proto, NULL); +} EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, - struct ipv6_sr_hdr *osrh, int proto) + struct ipv6_sr_hdr *osrh, int proto, + struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); @@ -230,7 +237,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, tot_len = red_hdrlen + sizeof(struct ipv6hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -317,8 +324,8 @@ out: return 0; } -/* insert an SRH within an IPv6 packet, just after the IPv6 header */ -int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -326,7 +333,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -369,9 +376,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -EXPORT_SYMBOL_GPL(seg6_do_srh_inline); -static int seg6_do_srh(struct sk_buff *skb) +static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; @@ -384,7 +390,7 @@ static int seg6_do_srh(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; - err = seg6_do_srh_inline(skb, tinfo->srh); + err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; @@ -402,9 +408,11 @@ static int seg6_do_srh(struct sk_buff *skb) return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + err = __seg6_do_srh_encap(skb, tinfo->srh, + proto, cache_dst); else - err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + err = seg6_do_srh_encap_red(skb, tinfo->srh, + proto, cache_dst); if (err) return err; @@ -425,11 +433,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, - IPPROTO_ETHERNET); + err = __seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET, + cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, - IPPROTO_ETHERNET); + IPPROTO_ETHERNET, + cache_dst); if (err) return err; @@ -444,6 +454,13 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + return __seg6_do_srh_inline(skb, osrh, NULL); +} +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -458,31 +475,33 @@ static int seg6_input_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { + 
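/*
 * [Editor's note, illustration only, not part of this diff] Note how
 * every dst_cache_get()/dst_cache_set_ip6() in the reshuffled ioam6,
 * rpl and seg6 paths is bracketed by local_bh_disable()/enable():
 * dst_cache keeps per-CPU state, so process-context callers must stop
 * softirqs from interleaving on the same CPU. Canonical usage:
 *
 *	local_bh_disable();
 *	dst = dst_cache_get(&cache);
 *	local_bh_enable();
 *
 *	if (!dst) {
 *		dst = resolve_route(...);	// hypothetical
 *		local_bh_disable();
 *		dst_cache_set_ip6(&cache, dst, &saddr);
 *		local_bh_enable();
 *	}
 */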
local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, @@ -528,16 +547,16 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -559,15 +578,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d766fd798ecf..7c14c449804c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1448,6 +1448,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d692b902e120..e83691073496 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -73,9 +73,9 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev int ret = l2tp_xmit_skb(session, skb); if (likely(ret == NET_XMIT_SUCCESS)) - dev_sw_netstats_tx_add(dev, 1, len); + dev_dstats_tx_add(dev, len); else - DEV_STATS_INC(dev, tx_dropped); + dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } @@ -84,7 +84,6 @@ static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, - .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, }; @@ -100,7 +99,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev) dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) @@ -128,7 +127,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, goto error_rcu; if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) - dev_sw_netstats_rx_add(dev, data_len); + dev_dstats_rx_add(dev, data_len); else DEV_STATS_INC(dev, rx_errors); diff --git a/net/mctp/device.c b/net/mctp/device.c index 26ce34b7e88e..8e0724c56723 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -20,8 +20,7 @@ #include <net/sock.h> struct mctp_dump_cb { - int h; - int idx; + unsigned long ifindex; size_t a_idx; }; @@ 
-115,43 +114,29 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct mctp_dump_cb *mcb = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct hlist_head *head; struct net_device *dev; struct ifaddrmsg *hdr; struct mctp_dev *mdev; - int ifindex; - int idx = 0, rc; + int ifindex, rc; hdr = nlmsg_data(cb->nlh); // filter by ifindex if requested ifindex = hdr->ifa_index; rcu_read_lock(); - for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { - idx = 0; - head = &net->dev_index_head[mcb->h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx >= mcb->idx && - (ifindex == 0 || ifindex == dev->ifindex)) { - mdev = __mctp_dev_get(dev); - if (mdev) { - rc = mctp_dump_dev_addrinfo(mdev, - skb, cb); - mctp_dev_put(mdev); - // Error indicates full buffer, this - // callback will get retried. - if (rc < 0) - goto out; - } - } - idx++; - // reset for next iteration - mcb->a_idx = 0; - } + for_each_netdev_dump(net, dev, mcb->ifindex) { + if (ifindex && ifindex != dev->ifindex) + continue; + mdev = __mctp_dev_get(dev); + if (!mdev) + continue; + rc = mctp_dump_dev_addrinfo(mdev, skb, cb); + mctp_dev_put(mdev); + if (rc < 0) + break; + mcb->a_idx = 0; } -out: rcu_read_unlock(); - mcb->idx = idx; return skb->len; } @@ -531,9 +516,12 @@ static struct notifier_block mctp_dev_nb = { }; static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = { - {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR, + .doit = mctp_rtm_newaddr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR, + .doit = mctp_rtm_deladdr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR, + .dumpit = mctp_dump_addrinfo}, }; int __init mctp_device_init(void) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7a0f7998376a..98ac73938bd8 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -107,8 +107,8 @@ static void remote_address(const struct sock_common *skc, #endif } -static bool lookup_subflow_by_saddr(const struct list_head *list, - const struct mptcp_addr_info *saddr) +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -1447,8 +1447,8 @@ out_free: return ret; } -static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -1476,7 +1476,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - ret = remove_anno_list_by_saddr(msk, addr); + ret = mptcp_remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); if (ret) { @@ -1520,7 +1520,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, } lock_sock(sk); - remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); + remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); @@ -1633,36 +1633,6 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return ret; } -/* Called from 
the userspace PM only */ -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - int anno_nr = 0; - - list_for_each_entry(entry, rm_list, list) { - if (alist.nr >= MPTCP_RM_IDS_MAX) - break; - - /* only delete if either announced or matching a subflow */ - if (remove_anno_list_by_saddr(msk, &entry->addr)) - anno_nr++; - else if (!lookup_subflow_by_saddr(&msk->conn_list, - &entry->addr)) - continue; - - alist.ids[alist.nr++] = entry->addr.id; - } - - if (alist.nr) { - spin_lock_bh(&msk->pm.lock); - msk->pm.add_addr_signaled -= anno_nr; - mptcp_pm_remove_addr(msk, &alist); - spin_unlock_bh(&msk->pm.lock); - } -} - -/* Called from the in-kernel PM only */ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { @@ -1671,11 +1641,11 @@ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, list_for_each_entry(entry, rm_list, list) { if (slist.nr < MPTCP_RM_IDS_MAX && - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX && - remove_anno_list_by_saddr(msk, &entry->addr)) + mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index e35178f5205f..740a10d669f8 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -8,6 +8,10 @@ #include "mib.h" #include "mptcp_pm_gen.h" +#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \ + list_for_each_entry(__entry, \ + &((__msk)->pm.userspace_pm_local_addr_list), list) + void mptcp_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; @@ -26,6 +30,19 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } } +static struct mptcp_pm_addr_entry * +mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_addr_entry *entry; + + mptcp_for_each_userspace_pm_addr(msk, entry) { + if (mptcp_addresses_equal(&entry->addr, addr, false)) + return entry; + } + return NULL; +} + static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry, bool needs_id) @@ -41,7 +58,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, e) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; @@ -90,22 +107,20 @@ append_err: static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { - struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; - list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { - /* TODO: a refcount is needed because the entry can - * be used multiple times (e.g. fullmesh mode). 
- */ - list_del_rcu(&entry->list); - sock_kfree_s(sk, entry, sizeof(*entry)); - msk->pm.local_addr_used--; - return 0; - } - } - - return -EINVAL; + entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr); + if (!entry) + return -EINVAL; + + /* TODO: a refcount is needed because the entry can + * be used multiple times (e.g. fullmesh mode). + */ + list_del_rcu(&entry->list); + sock_kfree_s(sk, entry, sizeof(*entry)); + msk->pm.local_addr_used--; + return 0; } static struct mptcp_pm_addr_entry * @@ -113,7 +128,7 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (entry->addr.id == id) return entry; } @@ -123,17 +138,12 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { - struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry; + struct mptcp_pm_addr_entry *entry = NULL, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&e->addr, skc, false)) { - entry = e; - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, skc); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; @@ -153,50 +163,60 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; - bool backup = false; + bool backup; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, skc, false)) { - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); spin_unlock_bh(&msk->pm.lock); return backup; } -int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) +static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct mptcp_sock *msk; + + if (!token) { + GENL_SET_ERR_MSG(info, "missing required token"); + return NULL; + } + + msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token)); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return NULL; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + sock_put((struct sock *)msk); + return NULL; + } + + return msk; +} + +int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) +{ struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!addr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!addr) { + GENL_SET_ERR_MSG(info, "missing required address"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto 
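/*
 * [Editor's note, illustration only, not part of this diff] The new
 * mptcp_userspace_pm_get_sock() helper above folds the three checks
 * every userspace-PM handler used to repeat (token present, token
 * resolves to a socket, socket uses the userspace PM) into one call,
 * so each doit() handler reduces to the pattern:
 *
 *	msk = mptcp_userspace_pm_get_sock(info);
 *	if (!msk)
 *		return -EINVAL;	// extack already set by the helper
 *	...
 *	sock_put((struct sock *)msk);	// drop the reference it took
 */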
announce_err; - } - err = mptcp_pm_parse_entry(addr, info, true, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "error parsing local address"); @@ -267,40 +287,48 @@ remove_err: return err; } +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_rm_list alist = { .nr = 0 }; + int anno_nr = 0; + + /* only delete if either announced or matching a subflow */ + if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + return; + + alist.ids[alist.nr++] = entry->addr.id; + + spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); +} + int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; - struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; - u32 token_val; u8 id_val; - if (!id || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!id) { + GENL_SET_ERR_MSG(info, "missing required ID"); return err; } id_val = nla_get_u8(id); - token_val = nla_get_u32(token); - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); goto out; @@ -317,16 +345,14 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) goto out; } - list_move(&match->list, &free_list); + list_del_rcu(&match->list); spin_unlock_bh(&msk->pm.lock); - mptcp_pm_remove_addrs(msk, &free_list); + mptcp_pm_remove_addr_entry(msk, match); release_sock(sk); - list_for_each_entry_safe(match, entry, &free_list, list) { - sock_kfree_s(sk, match, sizeof(*match)); - } + sock_kfree_s(sk, match, sizeof(*match)); err = 0; out: @@ -337,7 +363,6 @@ out: int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry entry = { 0 }; struct mptcp_addr_info addr_r; @@ -345,28 +370,18 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!laddr || !raddr) { + GENL_SET_ERR_MSG(info, "missing required address(es)"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto create_err; - } - err = mptcp_pm_parse_entry(laddr, info, true, &entry); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); @@ -469,36 
+484,25 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct mptcp_addr_info addr_l; + struct mptcp_pm_addr_entry addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!laddr || !raddr) { + GENL_SET_ERR_MSG(info, "missing required address(es)"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto destroy_err; - } - - err = mptcp_pm_parse_addr(laddr, info, &addr_l); + err = mptcp_pm_parse_entry(laddr, info, true, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto destroy_err; @@ -511,43 +515,41 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { - ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6); - addr_l.family = AF_INET6; + if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { + ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6); + addr_l.addr.family = AF_INET6; } - if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) { - ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6); + if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) { + ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_l.addr.addr6); addr_r.family = AF_INET6; } #endif - if (addr_l.family != addr_r.family) { + if (addr_l.addr.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } - if (!addr_l.port || !addr_r.port) { + if (!addr_l.addr.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); err = -EINVAL; goto destroy_err; } lock_sock(sk); - ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); - if (ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_pm_addr_entry entry = { .addr = addr_l }; - - spin_lock_bh(&msk->pm.lock); - mptcp_userspace_pm_delete_local_addr(msk, &entry); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); - mptcp_close_ssk(sk, ssk, subflow); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); - err = 0; - } else { + ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); + if (!ssk) { err = -ESRCH; + goto release_sock; } + + spin_lock_bh(&msk->pm.lock); + mptcp_userspace_pm_delete_local_addr(msk, &addr_l); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); +release_sock: release_sock(sk); destroy_err: @@ -560,31 +562,19 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry loc = { .addr = { .family 
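/*
 * [Editor's note, illustration only, not part of this diff] The
 * v4-mapped fixups above exist so that a mixed AF_INET/AF_INET6 pair
 * can still pass the later family comparison: an IPv4 address
 * a.b.c.d is rewritten as ::ffff:a.b.c.d before the families are
 * compared. Hypothetical standalone example:
 *
 *	struct in6_addr m;
 *
 *	ipv6_addr_set_v4mapped(htonl(0xc0000201), &m);	// 192.0.2.1
 *	// m is now ::ffff:192.0.2.1 and ipv6_addr_v4mapped(&m) is true
 */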
= AF_UNSPEC }, }; struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct net *net = sock_net(skb->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - u32 token_val; u8 bkup = 0; - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(net, token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "userspace PM not selected"); - goto set_flags_err; - } - ret = mptcp_pm_parse_entry(attr, info, false, &loc); if (ret < 0) goto set_flags_err; @@ -606,13 +596,12 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) bkup = 1; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) { - if (bkup) - entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; - else - entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; - } + entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr); + if (entry) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; } spin_unlock_bh(&msk->pm.lock); @@ -632,33 +621,23 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1); } *bitmap; const struct genl_info *info = genl_info_dump(cb); - struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct nlattr *token; int ret = -EINVAL; struct sock *sk; void *hdr; bitmap = (struct id_bitmap *)cb->ctx; - token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (test_bit(entry->addr.id, bitmap->map)) continue; @@ -680,7 +659,6 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, release_sock(sk); ret = msg->len; -out: sock_put(sk); return ret; } @@ -689,28 +667,19 @@ int mptcp_userspace_pm_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct mptcp_pm_addr_entry addr, *entry; - struct net *net = sock_net(skb->sk); struct mptcp_sock *msk; struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; void *reply; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) goto out; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 
a93e661ef5c4..cd5132fe7d22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1027,6 +1027,10 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr); +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); @@ -1034,7 +1038,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b66..f8d87d622699 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3126,7 +3126,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ac5caf5a48e1..210b75e3179e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_object.o \ conn_service.o \ input.o \ + input_rack.o \ insecure.o \ io_thread.o \ key.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9d8bd0b37e41..86873399f7d5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -408,9 +408,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9..718193df9d2e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -30,6 +30,7 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; +struct rxrpc_txqueue; /* * Mark applied to socket buffers in skb->mark. 
skb->priority is used @@ -98,6 +99,7 @@ struct rxrpc_net { atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; @@ -109,6 +111,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + atomic_t stat_tx_jumbo[10]; + atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; @@ -210,9 +214,8 @@ struct rxrpc_skb_priv { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ - u8 nr_acks; /* Number of acks+nacks */ - u8 nr_nacks; /* Number of nacks */ } ack; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -320,6 +323,12 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + /* Provide a kvec table sufficiently large to manage either a DATA + * packet with a maximum set of jumbo subpackets or a PING ACK padded + * out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -338,25 +347,28 @@ struct rxrpc_peer { time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ - /* calculated RTT cache */ -#define RXRPC_RTT_CACHE_SIZE 32 - spinlock_t rtt_input_lock; /* RTT lock for input routine */ - ktime_t rtt_last_req; /* Time of last RTT request */ - unsigned int rtt_count; /* Number of samples we've got */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + seqcount_t mtu_lock; /* Lockless MTU access management */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_us; /* Retransmission timeout in usec */ - u8 backoff; /* Backoff timeout (as shift) */ + /* Calculated RTT cache */ + unsigned int recent_srtt_us; + unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion 
slow-start threshold */ }; @@ -525,6 +537,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -557,6 +571,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ + RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ @@ -599,13 +614,25 @@ enum rxrpc_call_state { /* * Call Tx congestion management modes. */ -enum rxrpc_congest_mode { - RXRPC_CALL_SLOW_START, - RXRPC_CALL_CONGEST_AVOIDANCE, - RXRPC_CALL_PACKET_LOSS, - RXRPC_CALL_FAST_RETRANSMIT, - NR__RXRPC_CONGEST_MODES -}; +enum rxrpc_ca_state { + RXRPC_CA_SLOW_START, + RXRPC_CA_CONGEST_AVOIDANCE, + RXRPC_CA_PACKET_LOSS, + RXRPC_CA_FAST_RETRANSMIT, + NR__RXRPC_CA_STATES +} __mode(byte); + +/* + * Current purpose of call RACK timer. According to the RACK-TLP protocol + * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for + * one of these at once. + */ +enum rxrpc_rack_timer_mode { + RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ + RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ + RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ + RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ +} __mode(byte); /* * RxRPC call definition @@ -624,8 +651,7 @@ struct rxrpc_call { struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ - ktime_t ack_lost_at; /* When ACK is figured as lost */ - ktime_t resend_at; /* When next resend needs to happen */ + ktime_t rack_timo_at; /* When ACK is figured as lost */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ @@ -670,21 +696,30 @@ struct rxrpc_call { unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + /* Sendmsg data tracking. */ + rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ + struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ + /* Transmitted data tracking. */ - spinlock_t tx_lock; /* Transmit queue lock */ - struct list_head tx_sendmsg; /* Sendmsg prepared packets */ - struct list_head tx_buffer; /* Buffer of transmissible packets */ + struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ + struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ + rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ - rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. 
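[Editor's note, illustration only, not part of this diff] tx_bottom,
tx_transmitted and tx_top delimit a sliding window of wrapping
rxrpc_seq_t sequence numbers, so distances must be taken by unsigned
subtraction rather than direct comparison. A hypothetical helper for
the number of occupied Tx window slots:

	static unsigned int rxrpc_tx_window_used(const struct rxrpc_call *call)
	{
		return call->tx_top - call->tx_bottom;	// wrap-safe
	}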
*/
+ rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */
 u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */
- u8 tx_winsize; /* Maximum size of Tx window */
+ u16 tx_nr_sent; /* Number of packets sent, but unacked */
+ u16 tx_nr_lost; /* Number of packets marked lost */
+ u16 tx_nr_resent; /* Number of packets resent, but unacked */
+ u16 tx_winsize; /* Maximum size of Tx window */
 #define RXRPC_TX_MAX_WINDOW 128
+ u8 tx_jumbo_max; /* Maximum subpkts peer will accept */
 ktime_t tx_last_sent; /* Last time a transmission occurred */
 /* Received data tracking */
 struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
+ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */
 struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
 rxrpc_seq_t rx_highest_seq; /* Highest sequence number received */
@@ -698,14 +733,32 @@ struct rxrpc_call {
 */
 #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
 #define RXRPC_MIN_CWND 4
- u8 cong_cwnd; /* Congestion window size */
+ enum rxrpc_ca_state cong_ca_state; /* Congestion control state */
 u8 cong_extra; /* Extra to send for congestion management */
- u8 cong_ssthresh; /* Slow-start threshold */
- enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
- u8 cong_dup_acks; /* Count of ACKs showing missing packets */
- u8 cong_cumul_acks; /* Cumulative ACK count */
+ u16 cong_cwnd; /* Congestion window size */
+ u16 cong_ssthresh; /* Slow-start threshold */
+ u16 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u16 cong_cumul_acks; /* Cumulative ACK count */
 ktime_t cong_tstamp; /* Last time cwnd was changed */
- struct sk_buff *cong_last_nack; /* Last ACK with nacks received */
+
+ /* RACK-TLP [RFC8985] state. */
+ ktime_t rack_xmit_ts; /* Latest transmission timestamp */
+ ktime_t rack_rtt; /* RTT of most recently ACK'd segment */
+ ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */
+ ktime_t rack_reo_wnd; /* Reordering window */
+ unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
+ int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
+ rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */
+ rxrpc_seq_t rack_end_seq; /* Highest sequence seen */
+ rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
+ bool rack_dsack_round_none; /* T if dsack_round is "None" */
+ bool rack_reordering_seen; /* T if detected reordering event */
+ enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
+ bool tlp_is_retrans; /* T if unacked TLP retransmission */
+ rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */
+ rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */
+ unsigned int tlp_rtt_taken; /* Last time RTT taken */
+ ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
 /* Receive-phase ACK management (ACKs we send). */
 u8 ackr_reason; /* reason to ACK */
@@ -730,32 +783,45 @@ struct rxrpc_call {
 /* Transmission-phase ACK management (ACKs we've received).
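* (Editorial note, inferred from the rotation logic in input.c below: a
* "soft" ACK or NACK covers a segment that is still outstanding and may
* be revised by a later ACK packet, whereas the hard-ACK point only ever
* advances; acks_nr_sacks and acks_nr_snacks count the soft states
* currently on record.)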
*/ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_seq_t acks_first_seq; /* first sequence number received */ + rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ - rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + unsigned short acks_nr_sacks; /* Number of soft acks recorded */ + unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ + + /* Calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { - u16 nr_acks; /* Number of ACKs in packet */ - u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_new_nacks; /* Number of new nacks in packet */ - u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ - u8 ack_reason; - bool saw_nacks; /* Saw NACKs in packet */ - bool new_low_nack; /* T if new low NACK found */ - bool retrans_timeo; /* T if reTx due to timeout happened */ - u8 flight_size; /* Number of unreceived transmissions */ - /* Place to stash values for tracing */ - enum rxrpc_congest_mode mode:8; - u8 cwnd; - u8 ssthresh; - u8 dup_acks; - u8 cumulative_acks; + rxrpc_serial_t ack_serial; /* Serial number of ACK */ + rxrpc_serial_t acked_serial; /* Serial number ACK'd */ + u16 in_flight; /* Number of unreceived transmissions */ + u16 nr_new_hacks; /* Number of rotated new ACKs */ + u16 nr_new_sacks; /* Number of new soft ACKs in packet */ + u16 nr_new_snacks; /* Number of new soft nacks in packet */ + u8 ack_reason; + bool new_low_snack:1; /* T if new low soft NACK found */ + bool retrans_timeo:1; /* T if reTx due to timeout happened */ + bool need_retransmit:1; /* T if we need transmission */ + bool rtt_sample_avail:1; /* T if RTT sample available */ + bool in_fast_or_rto_recovery:1; + bool exiting_fast_or_rto_recovery:1; + bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ + u8 /*enum rxrpc_congest_change*/ change; }; /* @@ -793,25 +859,23 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. 
*/ struct rxrpc_txbuf { - struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ - struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ - unsigned short ack_rwind; /* ACK receive window */ - u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ + bool jumboable; /* Can be non-terminal jumbo subpacket */ u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[3]; + struct kvec kvec[1]; }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) @@ -824,6 +888,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) return !rxrpc_sending_to_server(txb); } +/* + * Transmit queue element, including RACK [RFC8985] per-segment metadata. The + * transmission timestamp is in usec from the base. + */ +struct rxrpc_txqueue { + /* Start with the members we want to prefetch. */ + struct rxrpc_txqueue *next; + ktime_t xmit_ts_base; + rxrpc_seq_t qbase; + u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ + unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ + unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ + unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ + unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ + unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ + + /* The arrays we want to pack into as few cache lines as possible. */ + struct { +#define RXRPC_NR_TXQUEUE BITS_PER_LONG +#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) + struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_serial[RXRPC_NR_TXQUEUE]; + unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; + } ____cacheline_aligned; +}; + +/* + * Data transmission request. + */ +struct rxrpc_send_data_req { + ktime_t now; /* Current time */ + struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ + rxrpc_seq_t seq; /* Sequence of first data */ + int n; /* Number of DATA packets to glue into jumbo */ + bool retrans; /* T if this is a retransmission */ + bool did_send; /* T if did actually send */ + bool tlp_probe; /* T if this is a TLP probe */ + int /* enum rxrpc_txdata_trace */ trace; +}; + #include <trace/events/rxrpc.h> /* @@ -841,6 +945,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn } /* + * Allocate the next serial n numbers on a connection. 0 must be skipped. 
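+ * Worked example (editorial illustration): with conn->tx_serial ==
+ * 0xfffffffe and n == 4, serial + n wraps to 2, the (serial + n <= n)
+ * test trips and numbering restarts at 1, so the caller receives
+ * serials 1-4 and conn->tx_serial becomes 5 - serial 0 is never issued.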
+ */
+static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn,
+ unsigned int n)
+{
+ rxrpc_serial_t serial;
+
+ serial = conn->tx_serial;
+ if (serial + n <= n)
+ serial = 1;
+ conn->tx_serial = serial + n;
+ return serial;
+}
+
+/*
 * af_rxrpc.c
 */
 extern atomic_t rxrpc_n_rx_skbs;
@@ -865,10 +984,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
 enum rxrpc_propose_ack_trace why);
 void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
 enum rxrpc_propose_ack_trace);
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
-
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_resend_tlp(struct rxrpc_call *call);
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace);
+bool rxrpc_input_call_event(struct rxrpc_call *call);
 /*
 * call_object.c
@@ -1047,6 +1166,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
 void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
 /*
+ * input_rack.c
+ */
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix);
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks);
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary);
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
+void rxrpc_tlp_send_probe(struct rxrpc_call *call);
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
+
+/* Initialise TLP state [RFC8985 7.1].
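+ * (Editorial note: per the field comments in struct rxrpc_call,
+ * tlp_serial == 0 means "no probe in progress", so zeroing it here
+ * cancels any outstanding probe accounting while tlp_seq is re-pinned
+ * to the current hard-ACK point.)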
*/ +static inline void rxrpc_tlp_init(struct rxrpc_call *call) +{ + call->tlp_serial = 0; + call->tlp_seq = call->acks_hard_ack; + call->tlp_is_retrans = false; +} + +/* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); @@ -1149,17 +1294,20 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c @@ -1208,10 +1356,12 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); -void rxrpc_peer_init_rtt(struct rxrpc_peer *); +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); +void rxrpc_call_init_rtt(struct rxrpc_call *call); /* * rxkad.c @@ -1284,7 +1434,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); @@ -1311,6 +1460,53 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline u32 earliest(u32 seq1, u32 seq2) +{ + return before(seq1, seq2) ? seq1 : seq2; +} + +static inline u32 latest(u32 seq1, u32 seq2) +{ + return after(seq1, seq2) ? seq1 : seq2; +} + +static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) +{ + return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase; +} + +static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); + __skb_queue_tail(&call->rx_queue, skb); + rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); +} + +/* + * Calculate how much space there is for transmitting more DATA packets. 
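+ * Worked example (editorial): with tx_winsize == 128, cong_cwnd == 10
+ * and cong_extra == 0, winsize is umin(128, 10 + 0) == 10; if
+ * tx_top - tx_bottom == 7 packets are outstanding, 3 more may be sent.
+ * The max() clamps the result to 0 when the window is over-full.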
+ */ +static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) +{ + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int transmitted = call->tx_top - call->tx_bottom; + + return max(winsize - transmitted, 0); +} + +static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) +{ + return call->acks_nr_sacks + call->tx_nr_lost; +} + +/* + * Calculate the number of transmitted DATA packets assumed to be in flight + * [approx RFC6675]. + */ +static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) +{ + return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0f5a1d77b890..e685034ce4f7 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock(&rx->incoming_lock); - spin_unlock(&rx->incoming_lock); + spin_lock_irq(&rx->incoming_lock); + spin_unlock_irq(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; @@ -343,7 +343,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - read_lock(&local->services_lock); + read_lock_irq(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just @@ -399,34 +399,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); if (hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); - rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return true; } diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7bbb68504766..8e477f7f8850 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); - if (call->peer->srtt_us) - delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC; + if (call->srtt_us) + delay = (call->srtt_us >> 3) * NSEC_PER_USEC; else delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); ktime_add_ms(delay, call->tx_backoff); @@ -55,147 +55,104 @@ void 
rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, } /* - * Handle congestion being detected by the retransmit timeout. + * Retransmit one or more packets. */ -static void rxrpc_congestion_timeout(struct rxrpc_call *call) +static bool rxrpc_retransmit_data(struct rxrpc_call *call, + struct rxrpc_send_data_req *req) { - set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); + struct rxrpc_txqueue *tq = req->tq; + unsigned int ix = req->seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[ix]; + + _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); + + req->retrans = true; + trace_rxrpc_retransmit(call, req, txb); + + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_send_data_packet(call, req); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + + req->tq = NULL; + req->n = 0; + req->did_send = true; + req->now = ktime_get_real(); + return true; } /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) +static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_ackpacket *ack = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_txbuf *txb; - rxrpc_seq_t transmitted = call->tx_transmitted; - ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - ktime_t resend_at = KTIME_MAX, now, delay; - bool unacked = false, did_send = false; - unsigned int i; - - _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); - - now = ktime_get_real(); - - if (list_empty(&call->tx_buffer)) - goto no_resend; + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .trace = rxrpc_txdata_retransmit, + }; + struct rxrpc_txqueue *tq; - trace_rxrpc_resend(call, ack_skb); - txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); + _enter("{%d,%d}", call->tx_bottom, call->tx_top); - /* Scan the soft ACK table without dropping the lock and resend any - * explicitly NAK'd packets. - */ - if (ack_skb) { - sp = rxrpc_skb(ack_skb); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + trace_rxrpc_resend(call, call->acks_highest_serial); - for (i = 0; i < sp->ack.nr_acks; i++) { - rxrpc_seq_t seq; + /* Scan the transmission queue, looking for lost packets. 
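+ * Each queue segment keeps a bit-per-buffer lost bitmap. Editorial
+ * example: lost == 0b1010 makes __ffs() yield index 1, so seq
+ * qbase + 1 is retransmitted and its bit cleared, then index 3 is
+ * handled the same way.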
*/ + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long lost = tq->segment_lost; - if (ack->acks[i] & 1) - continue; - seq = sp->ack.first_ack + i; - if (after(txb->seq, transmitted)) - break; - if (after(txb->seq, seq)) - continue; /* A new hard ACK probably came in */ - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (txb->seq == seq) - goto found_txb; - } - goto no_further_resend; - - found_txb: - resend_at = ktime_add(txb->last_sent, rto); - if (after(txb->serial, call->acks_highest_serial)) { - if (ktime_after(resend_at, now) && - ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; /* Ack point not yet reached */ - } + if (after(tq->qbase, call->tx_transmitted)) + break; - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + _debug("retr %16lx %u c=%08x [%x]", + tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); + _debug("lost %16lx", lost); - trace_rxrpc_retransmit(call, txb->seq, txb->serial, - ktime_sub(resend_at, now)); + trace_rxrpc_resend_lost(call, tq, lost); + while (lost) { + unsigned int ix = __ffs(lost); + struct rxrpc_txbuf *txb = tq->bufs[ix]; - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - now = ktime_get_real(); + __clear_bit(ix, &lost); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost); - if (list_is_last(&txb->call_link, &call->tx_buffer)) - goto no_further_resend; - txb = list_next_entry(txb, call_link); + req.tq = tq; + req.seq = tq->qbase + ix; + req.n = 1; + rxrpc_retransmit_data(call, &req); } } - /* Fast-forward through the Tx queue to the point the peer says it has - * seen. Anything between the soft-ACK table and that point will get - * ACK'd or NACK'd in due course, so don't worry about it here; here we - * need to consider retransmitting anything beyond that point. - */ - if (after_eq(call->acks_prev_seq, call->tx_transmitted)) - goto no_further_resend; - - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - resend_at = ktime_add(txb->last_sent, rto); - - if (before_eq(txb->seq, call->acks_prev_seq)) - continue; - if (after(txb->seq, call->tx_transmitted)) - break; /* Not transmitted yet */ - - if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(txb->serial, ntohl(ack->serial))) - goto do_resend; /* Wasn't accounted for by a more recent ping. */ - - if (ktime_after(resend_at, now)) { - if (ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; - } - - do_resend: - unacked = true; - - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - now = ktime_get_real(); - } + rxrpc_get_rto_backoff(call, req.did_send); + _leave(""); +} -no_further_resend: -no_resend: - if (resend_at < KTIME_MAX) { - delay = rxrpc_get_rto_backoff(call->peer, did_send); - resend_at = ktime_add(resend_at, delay); - trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset); +/* + * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3]. + */ +void rxrpc_resend_tlp(struct rxrpc_call *call) +{ + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted, + .n = 1, + .tlp_probe = true, + .trace = rxrpc_txdata_tlp_retransmit, + }; + + /* There's a chance it'll be on the tail segment of the queue. 
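+ * (Editorial note: tx_transmitted is the highest seq so far sent, so
+ * it normally falls within the tail segment's range
+ * [qbase, qbase + RXRPC_NR_TXQUEUE); the loop below is only the
+ * fallback for when it does not.)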
*/ + req.tq = READ_ONCE(call->tx_qtail); + if (req.tq && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; } - call->resend_at = resend_at; - - if (unacked) - rxrpc_congestion_timeout(call); - - /* If there was nothing that needed retransmission then it's likely - * that an ACK got lost somewhere. Send a ping to find out instead of - * retransmitting data. - */ - if (!did_send) { - ktime_t next_ping = ktime_add_us(call->acks_latest_ts, - call->peer->srtt_us >> 3); - if (ktime_sub(next_ping, now) <= 0) - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_0_retrans); + for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) { + if (after_eq(call->tx_transmitted, req.tq->qbase) && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; + } } - - _leave(""); } /* @@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) -{ - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; - - space = wtop - tx_top; - return space > 0; -} - /* - * Decant some if the sendmsg prepared queue into the transmission buffer. + * Transmit some as-yet untransmitted data, to a maximum of the supplied limit. */ -static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { - if (list_empty(&call->tx_sendmsg)) + if (call->send_top == call->tx_top) return; rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - spin_lock(&call->tx_lock); - list_del(&txb->call_link); - spin_unlock(&call->tx_lock); + while (space > 0) { + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted + 1, + .n = 0, + .trace = trace, + }; + struct rxrpc_txqueue *tq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t send_top, seq; + int limit = min(space, max(call->peer->pmtud_jumbo, 1)); + + /* Order send_top before the contents of the new txbufs and + * txqueue pointers + */ + send_top = smp_load_acquire(&call->send_top); + if (call->tx_top == send_top) + break; - call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); + trace_rxrpc_transmit(call, send_top, space); - if (txb->flags & RXRPC_LAST_PACKET) - rxrpc_close_tx_phase(call); + tq = call->tx_qtail; + seq = call->tx_top; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant); - rxrpc_transmit_one(call, txb); + do { + int ix; - if (!rxrpc_tx_window_has_space(call)) - break; + seq++; + ix = seq & RXRPC_TXQ_MASK; + if (!ix) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance); + } + if (!req.tq) + req.tq = tq; + txb = tq->bufs[ix]; + req.n++; + if (!txb->jumboable) + break; + } while (req.n < limit && before(seq, send_top)); + + if (txb->flags & RXRPC_LAST_PACKET) { + rxrpc_close_tx_phase(call); + tq = NULL; + } + call->tx_qtail = tq; + call->tx_top = seq; + + space -= req.n; + rxrpc_send_data_packet(call, &req); } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call) +void rxrpc_transmit_some_data(struct 
rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: - if (list_empty(&call->tx_sendmsg)) + if (call->tx_bottom == READ_ONCE(call->send_top)) return; rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; - if (list_empty(&call->tx_sendmsg)) { + if (call->tx_bottom == READ_ONCE(call->send_top)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_decant_prepared_tx(call); + rxrpc_transmit_fresh_data(call, limit, trace); break; default: return; @@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) */ static void rxrpc_send_initial_ping(struct rxrpc_call *call) { - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + if (call->rtt_count < 3 || + ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_params); @@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call) { + struct sk_buff *skb; ktime_t now, t; - bool resend = false; + bool did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], call->events); - if (__rxrpc_call_is_complete(call)) - goto out; - /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { @@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) - goto out; + do { + skb = __skb_dequeue(&call->rx_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; + + rxrpc_input_call_packet(call, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + did_receive = true; + } - if (skb) - rxrpc_input_call_packet(call, skb); + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, + rxrpc_timer_trace_rack_off + call->rack_timer_mode); + call->rack_timo_at = KTIME_MAX; + rxrpc_rack_timer_expired(call, t); + } + + } while (!skb_queue_empty(&call->rx_queue)); /* If we see our async-event poke, check for timeout trippage. 
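* (Editorial note: the rack_timo_at timer polled in the loop above is
* multiplexed over the four rxrpc_rack_timer_mode uses - off, RACK
* reordering, TLP PTO and RTO - only one of which may be armed at a
* time.)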
*/ now = ktime_get_real(); @@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_delayed_ack); } - t = ktime_sub(call->ack_lost_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack); - call->ack_lost_at = KTIME_MAX; - set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); - } - t = ktime_sub(call->ping_at, now); if (t <= 0) { trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); @@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - t = ktime_sub(call->resend_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend); - call->resend_at = KTIME_MAX; - resend = true; - } - - rxrpc_transmit_some_data(call); - now = ktime_get_real(); t = ktime_sub(call->keepalive_at, now); if (t <= 0) { @@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) - rxrpc_congestion_degrade(call); - } - if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) rxrpc_send_initial_ping(call); + rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data); + + if (saw_ack) + rxrpc_congestion_degrade(call); + + if (did_receive && + (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST || + __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) { + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + trace_rxrpc_rack(call, t); + } + /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && + if (call->tx_nr_lost > 0 && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) - rxrpc_resend(call, NULL); + rxrpc_resend(call); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); if (call->ackr_nr_unacked > 2) { - if (call->peer->rtt_count < 3) + if (call->rtt_count < 3) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_rtt); - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_old_rtt); @@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) set(call->expect_req_by); set(call->expect_rx_by); set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); + set(call->rack_timo_at); set(call->keepalive_at); set(call->ping_at); @@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { unsigned long nowj = jiffies, delayj, nextj; - delayj = max(nsecs_to_jiffies(delay), 1); + delayj = umax(nsecs_to_jiffies(delay), 1); nextj = nowj + delayj; if (time_before(nextj, call->timer.expires) || !timer_pending(&call->timer)) { @@ -484,9 +474,12 @@ out: rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (did_receive && + call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; diff --git a/net/rxrpc/call_object.c 
b/net/rxrpc/call_object.c index f9e983a12c14..5a543c3f6fb0 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -49,7 +49,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) @@ -57,7 +57,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } @@ -146,23 +146,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - INIT_LIST_HEAD(&call->tx_sendmsg); - INIT_LIST_HEAD(&call->tx_buffer); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); - spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; + call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; - call->ack_lost_at = KTIME_MAX; - call->resend_at = KTIME_MAX; + call->rack_timo_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; @@ -177,6 +175,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; + rxrpc_call_init_rtt(call); + call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); @@ -220,9 +220,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(p->timeouts.normal, 1); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(p->timeouts.idle, 1); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; @@ -302,9 +302,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; @@ -434,7 +434,7 @@ error_attached_to_socket: /* * Set up an incoming call. call->conn points to the connection. - * This is called in BH context and isn't allowed to fail. + * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -531,11 +531,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Clean up the Rx skb ring. + * Clean up the transmission buffers. 
+ */ +static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq, *next; + + for (tq = call->tx_queue; tq; tq = next) { + next = tq->next; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (tq->bufs[i]) + rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); + trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); + kfree(tq); + } +} + +/* + * Clean up the receive buffers. */ -static void rxrpc_cleanup_ring(struct rxrpc_call *call) +static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); + rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } @@ -558,7 +576,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -571,7 +589,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -671,23 +689,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu) static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - struct rxrpc_txbuf *txb; del_timer_sync(&call->timer); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - + rxrpc_cleanup_tx_buffers(call); + rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index bb11e8289d6d..db0099197890 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; @@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; @@ -508,16 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; + LIST_HEAD(new_client_calls); - while ((call = list_first_entry_or_null(&local->new_client_calls, - struct rxrpc_call, wait_link)) - ) { + spin_lock_irq(&local->client_call_lock); + 
list_splice_tail_init(&local->new_client_calls, &new_client_calls); + spin_unlock_irq(&local->client_call_lock); + + while ((call = list_first_entry_or_null(&new_client_calls, + struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; - spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); rxrpc_see_call(call, rxrpc_call_see_waiting_call); - spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); @@ -545,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } } @@ -588,9 +590,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); /* May still be on ->new_client_calls. */ - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); return; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 598b4ee389fc..713e04394ceb 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -26,7 +26,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff bool aborted = false; if (conn->state != RXRPC_CONN_ABORTED) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state != RXRPC_CONN_ABORTED) { conn->abort_code = abort_code; conn->error = err; @@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); aborted = true; } - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } return aborted; @@ -63,11 +63,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -91,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -149,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -158,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? 
RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - trailer.maxMTU = htonl(rxrpc_rx_mtu); - trailer.ifMTU = htonl(mtu); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); trailer.rwind = htonl(rxrpc_rx_window_size); - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); @@ -171,7 +177,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0, rxrpc_rx_window_size); + pkt.ack.reason, 0, rxrpc_rx_window_size, + rxrpc_propose_ack_retransmit); break; default: @@ -202,11 +209,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); @@ -252,10 +262,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE) { /* Offload call state flipping to the I/O thread. As diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 694c4df7a1a3..7eba4d7d9a38 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) if (WARN_ON_ONCE(!local)) return; - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } @@ -196,9 +196,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { @@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work) list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 16d49a861dbb..4974b5accafa 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -27,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Do TCP-style congestion management [RFC 5681]. + * Do TCP-style congestion management [RFC5681]. 
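+ * (Editorial sketch of the state machine below: SLOW_START grows cwnd
+ * by one per ACK run until it reaches ssthresh, then CONGEST_AVOIDANCE
+ * grows it by at most one per RTT; any soft NACK drops the call into
+ * PACKET_LOSS, and a third dup ACK there starts FAST_RETRANSMIT with
+ * ssthresh = max(in_flight / 2, 2) and cwnd = ssthresh + 3.)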
*/ static void rxrpc_congestion_management(struct rxrpc_call *call, - struct sk_buff *skb, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t acked_serial) + struct rxrpc_ack_summary *summary) { - enum rxrpc_congest_change change = rxrpc_cong_no_change; - unsigned int cumulative_acks = call->cong_cumul_acks; - unsigned int cwnd = call->cong_cwnd; - bool resend = false; - - summary->flight_size = - (call->tx_top - call->acks_hard_ack) - summary->nr_acks; + summary->change = rxrpc_cong_no_change; + summary->in_flight = rxrpc_tx_in_flight(call); if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = 1; - if (cwnd >= call->cong_ssthresh && - call->cong_mode == RXRPC_CALL_SLOW_START) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; - cumulative_acks = 0; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = 1; + if (call->cong_cwnd >= call->cong_ssthresh && + call->cong_ca_state == RXRPC_CA_SLOW_START) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; + call->cong_cumul_acks = 0; } } - cumulative_acks += summary->nr_new_acks; - if (cumulative_acks > 255) - cumulative_acks = 255; - - summary->cwnd = call->cong_cwnd; - summary->ssthresh = call->cong_ssthresh; - summary->cumulative_acks = cumulative_acks; - summary->dup_acks = call->cong_dup_acks; + call->cong_cumul_acks += summary->nr_new_sacks; + call->cong_cumul_acks += summary->nr_new_hacks; + if (call->cong_cumul_acks > 255) + call->cong_cumul_acks = 255; - switch (call->cong_mode) { - case RXRPC_CALL_SLOW_START: - if (summary->saw_nacks) + switch (call->cong_ca_state) { + case RXRPC_CA_SLOW_START: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; - if (summary->cumulative_acks > 0) - cwnd += 1; - if (cwnd >= call->cong_ssthresh) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + if (call->cong_cumul_acks > 0) + call->cong_cwnd += 1; + if (call->cong_cwnd >= call->cong_ssthresh) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; } goto out; - case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->saw_nacks) + case RXRPC_CA_CONGEST_AVOIDANCE: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. 
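* (Editorial example: with cwnd == 10, only once at least 10 ACKs have
* accumulated inside one smoothed RTT does cwnd step to 11.)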
*/ - if (call->peer->rtt_count == 0) + if (call->rtt_count == 0) goto out; - if (ktime_before(skb->tstamp, + if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, - call->peer->srtt_us >> 3))) + call->srtt_us >> 3))) goto out_no_clear_ca; - change = rxrpc_cong_rtt_window_end; - call->cong_tstamp = skb->tstamp; - if (cumulative_acks >= cwnd) - cwnd++; + summary->change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cumul_acks >= call->cong_cwnd) + call->cong_cwnd++; goto out; - case RXRPC_CALL_PACKET_LOSS: - if (!summary->saw_nacks) + case RXRPC_CA_PACKET_LOSS: + if (call->acks_nr_snacks == 0) goto resume_normality; - if (summary->new_low_nack) { - change = rxrpc_cong_new_low_nack; + if (summary->new_low_snack) { + summary->change = rxrpc_cong_new_low_nack; call->cong_dup_acks = 1; if (call->cong_extra > 1) call->cong_extra = 1; @@ -111,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks < 3) goto send_extra_data; - change = rxrpc_cong_begin_retransmission; - call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = call->cong_ssthresh + 3; + summary->change = rxrpc_cong_begin_retransmission; + call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; + summary->in_fast_or_rto_recovery = true; goto out; - case RXRPC_CALL_FAST_RETRANSMIT: - if (!summary->new_low_nack) { - if (summary->nr_new_acks == 0) - cwnd += 1; + case RXRPC_CA_FAST_RETRANSMIT: + rxrpc_tlp_init(call); + summary->in_fast_or_rto_recovery = true; + if (!summary->new_low_snack) { + if (summary->nr_new_sacks == 0) + call->cong_cwnd += 1; call->cong_dup_acks++; if (call->cong_dup_acks == 2) { - change = rxrpc_cong_retransmit_again; + summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; } } else { - change = rxrpc_cong_progress; - cwnd = call->cong_ssthresh; - if (!summary->saw_nacks) + summary->change = rxrpc_cong_progress; + call->cong_cwnd = call->cong_ssthresh; + if (call->acks_nr_snacks == 0) { + summary->exiting_fast_or_rto_recovery = true; goto resume_normality; + } } goto out; @@ -145,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } resume_normality: - change = rxrpc_cong_cleared_nacks; + summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; - call->cong_tstamp = skb->tstamp; - if (cwnd < call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cwnd < call->cong_ssthresh) + call->cong_ca_state = RXRPC_CA_SLOW_START; else - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; out: - cumulative_acks = 0; + call->cong_cumul_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_TX_MAX_WINDOW) - cwnd = RXRPC_TX_MAX_WINDOW; - call->cong_cwnd = cwnd; - call->cong_cumul_acks = cumulative_acks; - summary->mode = call->cong_mode; - trace_rxrpc_congest(call, summary, acked_serial, change); - if (resend) - rxrpc_resend(call, skb); + if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) + call->cong_cwnd = RXRPC_TX_MAX_WINDOW; + trace_rxrpc_congest(call, summary); return; packet_loss_detected: - change = rxrpc_cong_saw_nack; - 
call->cong_mode = RXRPC_CALL_PACKET_LOSS; + summary->change = rxrpc_cong_saw_nack; + call->cong_ca_state = RXRPC_CA_PACKET_LOSS; call->cong_dup_acks = 0; goto send_extra_data; @@ -177,7 +164,7 @@ send_extra_data: * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->acks_hard_ack) { + call->acks_nr_sacks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -189,26 +176,42 @@ send_extra_data: */ void rxrpc_congestion_degrade(struct rxrpc_call *call) { - ktime_t rtt, now; + ktime_t rtt, now, time_since; - if (call->cong_mode != RXRPC_CALL_SLOW_START && - call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + if (call->cong_ca_state != RXRPC_CA_SLOW_START && + call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) return; if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; - rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); - if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + time_since = ktime_sub(now, call->tx_last_sent); + if (ktime_before(time_since, rtt)) return; - trace_rxrpc_reset_cwnd(call, now); + trace_rxrpc_reset_cwnd(call, time_since, rtt); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; - call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ca_state = RXRPC_CA_SLOW_START; + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); +} + +/* + * Add an RTT sample derived from an ACK'd DATA packet. + */ +static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + int ix) +{ + ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + summary->acked_serial, summary->ack_serial, + xmit_ts, call->acks_latest_ts); + __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ } /* @@ -217,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; - bool rot_last = false; + struct rxrpc_txqueue *tq = call->tx_queue; + rxrpc_seq_t seq = call->tx_bottom + 1; + bool rot_last = false, trace = false; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before_eq(txb->seq, call->acks_hard_ack)) - continue; - if (txb->flags & RXRPC_LAST_PACKET) { + _enter("%x,%x", call->tx_bottom, to); + + trace_rxrpc_tx_rotate(call, seq, to); + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + + if (call->acks_lowest_nak == call->tx_bottom) { + call->acks_lowest_nak = to; + } else if (after(to, call->acks_lowest_nak)) { + summary->new_low_snack = true; + call->acks_lowest_nak = to; + } + + /* We may have a left over fully-consumed buffer at the front that we + * couldn't drop before (rotate_and_keep below). 
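+ * (Editorial note: that is the case where tx_bottom + 1 has advanced
+ * exactly onto the next segment's base, which the test below catches
+ * as seq == tx_qbase + RXRPC_NR_TXQUEUE.)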
+ */ + if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } + + do { + unsigned int ix = seq - call->tx_qbase; + + _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags); + if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if (txb->seq == to) - break; - } - if (rot_last) - set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (summary->acked_serial == tq->segment_serial[ix] && + test_bit(ix, &tq->rtt_samples)) + rxrpc_add_data_rtt_sample(call, summary, tq, ix); + + if (ix == tq->nr_reported_acks) { + /* Packet directly hard ACK'd. */ + tq->nr_reported_acks++; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); + } else if (test_bit(ix, &tq->segment_acked)) { + /* Soft ACK -> hard ACK. */ + call->acks_nr_sacks--; + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack); + } else { + /* Soft NAK -> hard ACK. */ + call->acks_nr_snacks--; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); + } - _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + call->tx_nr_sent--; + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + __clear_bit(ix, &tq->ever_retransmitted); - if (call->acks_lowest_nak == call->acks_hard_ack) { - call->acks_lowest_nak = to; - } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); + tq->bufs[ix] = NULL; + + WRITE_ONCE(call->tx_bottom, seq); + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + + seq++; + trace = true; + if (!(seq & RXRPC_TXQ_MASK)) { + trace_rxrpc_rack_update(call, summary); + trace = false; + prefetch(tq->next); + if (tq != call->tx_qtail) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } else { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep); + tq = NULL; + break; + } + } + + } while (before_eq(seq, to)); + + if (trace) + trace_rxrpc_rack_update(call, summary); + + if (rot_last) { + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (tq) { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + call->tx_queue = NULL; + } } - smp_store_release(&call->acks_hard_ack, to); + _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - trace_rxrpc_txqueue(call, (rot_last ? 
- rxrpc_txqueue_rotate_last : - rxrpc_txqueue_rotate)); wake_up(&call->waitq); return rot_last; } @@ -263,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); - - if (unlikely(call->cong_last_nack)) { - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + call->rack_timo_at = KTIME_MAX; + trace_rxrpc_rack_timer(call, 0, false); + trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode); switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -365,7 +448,7 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - __skb_queue_tail(&call->recvmsg_queue, skb); + skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) @@ -442,7 +525,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); - spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); *_notify = true; @@ -464,8 +546,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_receive_queue_oos); } - spin_unlock(&call->recvmsg_queue.lock); - call->ackr_sack_base = sack; } else { unsigned int slot; @@ -530,7 +610,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; - int ack_reason = 0; + int ack_reason = 0, count = 1, stat_ix; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -559,12 +639,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->hdr.serial++; offset += RXRPC_JUMBO_SUBPKTLEN; len -= RXRPC_JUMBO_SUBPKTLEN; + count++; } sp->offset = offset; sp->len = len; rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]); + if (ack_reason > 0) { rxrpc_send_ACK(call, ack_reason, ack_serial, rxrpc_propose_ack_input_data); @@ -667,7 +751,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial, sent_at, resp_time); matched = true; } @@ -677,7 +761,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ if (after(acked_serial, orig_serial)) { trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, - orig_serial, acked_serial, 0, 0); + orig_serial, acked_serial, 0, 0, 0); clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); set_bit(i, &call->rtt_avail); @@ -685,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, } if (!matched) - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0); } /* @@ 
-695,10 +779,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(trailer->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -709,54 +796,149 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, + rxrpc_pmtud_reduce_ack); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_mtu; + write_seqcount_end(&peer->mtu_lock); + } + + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; + + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + capacity = clamp(capacity, 1, jumbo_max); } + call->tx_jumbo_max = capacity; + if (wake) wake_up(&call->waitq); } +#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__) +/* Clang doesn't support the %z constraint modifier */ +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + asm(" shr%z1 %1\n" \ + " inc %0\n" \ + " rcr%z2 %2\n" \ + : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \ + ); \ + }) +#else +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + typeof(rotate_into) __bit0 = *(shift_from) & 1; \ + *(shift_from) >>= 1; \ + shift_from++; \ + rotate_into >>= 1; \ + rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \ + }) +#endif + /* - * Determine how many nacks from the previous ACK have now been satisfied. + * Deal with RTT samples from soft ACKs. 
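+ * If the serial number the ACK says it is responding to matches one of + * the segments in this queue, use that segment's transmission time to + * derive an RTT sample.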
*/ -static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, - struct rxrpc_ack_summary *summary, - rxrpc_seq_t seq) +static void rxrpc_input_soft_rtt(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq) { - struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->ack.first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) + if (summary->acked_serial == tq->segment_serial[ix]) + return rxrpc_add_data_rtt_sample(call, summary, tq, ix); +} - if (after_eq(seq, old_seq + sp->ack.nr_acks)) { - summary->nr_new_acks += sp->ack.nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks); - summary->nr_retained_nacks = 0; - } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->ack.nr_nacks; - } else { - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_NACK) { - if (before(old_seq + i, seq)) - new_acks++; - else - retained_nacks++; - } +/* + * Process a batch of soft ACKs specific to a transmission queue segment. + */ +static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long extracted_acks, + int nr_reported, + rxrpc_seq_t seq, + rxrpc_seq_t *lowest_nak) +{ + unsigned long old_reported = 0, flipped, new_acks = 0; + unsigned long a_to_n, n_to_a = 0; + int new, a, n; + + if (tq->nr_reported_acks > 0) + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + + _enter("{%x,%lx,%d},%lx,%d,%x", + tq->qbase, tq->segment_acked, tq->nr_reported_acks, + extracted_acks, nr_reported, seq); + + _debug("[%x]", tq->qbase); + _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks); + _debug("sack %16lx %u", extracted_acks, nr_reported); + + /* See how many previously logged ACKs/NAKs have flipped. */ + flipped = (tq->segment_acked ^ extracted_acks) & old_reported; + if (flipped) { + n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */ + a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */ + a = hweight_long(n_to_a); + n = hweight_long(a_to_n); + _debug("flip %16lx", flipped); + _debug("ntoa %16lx %d", n_to_a, a); + _debug("aton %16lx %d", a_to_n, n); + call->acks_nr_sacks += a - n; + call->acks_nr_snacks += n - a; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } + + /* See how many new ACKs/NAKs have been acquired. 
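+ * Segments reported beyond the previously reported count are new: set + * bits in the extracted mask are new soft ACKs, the rest new soft NAKs.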
*/ + new = nr_reported - tq->nr_reported_acks; + if (new > 0) { + new_acks = extracted_acks & ~old_reported; + if (new_acks) { + a = hweight_long(new_acks); + n = new - a; + _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n); + call->acks_nr_sacks += a; + call->acks_nr_snacks += n; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } else { + call->acks_nr_snacks += new; + summary->nr_new_snacks += new; } + } + + tq->nr_reported_acks = nr_reported; + tq->segment_acked = extracted_acks; + trace_rxrpc_apply_acks(call, tq); - summary->nr_new_acks += new_acks; - summary->nr_retained_nacks = retained_nacks; + if (extracted_acks != ~0UL) { + rxrpc_seq_t lowest = seq + ffz(extracted_acks); + + if (before(lowest, *lowest_nak)) + *lowest_nak = lowest; } - return old_seq + sp->ack.nr_acks; + if (summary->acked_serial) + rxrpc_input_soft_rtt(call, summary, tq); + + new_acks |= n_to_a; + if (new_acks) + rxrpc_input_rack(call, summary, tq, new_acks); + + if (call->tlp_serial && + rxrpc_seq_in_txq(tq, call->tlp_seq) && + test_bit(call->tlp_seq - tq->qbase, &new_acks)) + summary->tlp_probe_acked = true; } /* @@ -770,39 +952,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct sk_buff *skb, - rxrpc_seq_t seq, - rxrpc_seq_t since) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, old_nacks = 0; + struct rxrpc_txqueue *tq = call->tx_queue; + unsigned long extracted = ~0UL; + unsigned int nr = 0; + rxrpc_seq_t seq = call->acks_hard_ack + 1; rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_ACK) { - summary->nr_acks++; - if (after_eq(seq, since)) - summary->nr_new_acks++; - } else { - summary->saw_nacks = true; - if (before(seq, since)) { - /* Overlap with previous ACK */ - old_nacks++; - } else { - summary->nr_new_nacks++; - sp->ack.nr_nacks++; - } + _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks); + + while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1)) + tq = tq->next; - if (before(seq, lowest_nak)) - lowest_nak = seq; + for (unsigned int i = 0; i < sp->ack.nr_acks; i++) { + /* Decant ACKs until we hit a txqueue boundary. */ + shiftr_adv_rotr(acks, extracted); + if (i == 256) { + acks -= i; + i = 0; } seq++; + nr++; + if ((seq & RXRPC_TXQ_MASK) != 0) + continue; + + _debug("bound %16lx %u", extracted, nr); + + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, + seq - RXRPC_NR_TXQUEUE, &lowest_nak); + extracted = ~0UL; + nr = 0; + tq = tq->next; + prefetch(tq); } - if (lowest_nak != call->acks_lowest_nak) { - call->acks_lowest_nak = lowest_nak; - summary->new_low_nack = true; + if (nr) { + unsigned int nr_reported = seq & RXRPC_TXQ_MASK; + + extracted >>= RXRPC_NR_TXQUEUE - nr_reported; + _debug("tail %16lx %u", extracted, nr_reported); + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported, + seq & ~RXRPC_TXQ_MASK, &lowest_nak); } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -810,9 +1003,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. 
*/ - if (old_nacks < summary->nr_retained_nacks) - summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; - summary->nr_retained_nacks = old_nacks; + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_snack = true; + } + + _debug("summary A=%d+%d N=%d+%d", + call->acks_nr_sacks, summary->nr_new_sacks, + call->acks_nr_snacks, summary->nr_new_snacks); } /* @@ -820,21 +1018,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * with respect to the ack state conveyed by preceding ACKs. */ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, - rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt) { - rxrpc_seq_t base = READ_ONCE(call->acks_first_seq); + rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack); - if (after(first_pkt, base)) + if (after(hard_ack, base)) return true; /* The window advanced */ - if (before(first_pkt, base)) + if (before(hard_ack, base)) return false; /* firstPacket regressed */ if (after_eq(prev_pkt, call->acks_prev_seq)) return true; /* previousPacket hasn't regressed. */ /* Some rx implementations put a serial number in previousPacket. */ - if (after_eq(prev_pkt, base + call->tx_winsize)) + if (after(prev_pkt, base + call->tx_winsize)) return false; return true; } @@ -852,53 +1050,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_acktrailer trailer; - rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - ack_serial = sp->hdr.serial; - acked_serial = sp->ack.acked_serial; - first_soft_ack = sp->ack.first_ack; - prev_pkt = sp->ack.prev_ack; - nr_acks = sp->ack.nr_acks; - hard_ack = first_soft_ack - 1; - summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? - sp->ack.reason : RXRPC_ACK__INVALID); - - trace_rxrpc_rx_ack(call, ack_serial, acked_serial, - first_soft_ack, prev_pkt, - summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + summary.ack_serial = sp->hdr.serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); - if (acked_serial != 0) { - switch (summary.ack_reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_other_ack); - break; - } - } + trace_rxrpc_rx_ack(call, sp); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + prefetch(call->tx_queue); /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. 
*/ if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, @@ -911,9 +1090,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * if we still have it buffered to the beginning. */ if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && - call->acks_hard_ack == 0 && + call->tx_bottom == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -921,11 +1100,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); - goto send_response; + if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { + trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); + goto send_response; /* Still respond if requested. */ } trailer.maxMTU = 0; @@ -937,34 +1114,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); - if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } else { - summary.nr_new_acks = first_soft_ack - call->acks_first_seq; - call->acks_lowest_nak = first_soft_ack + nr_acks; - since = first_soft_ack; - } - - call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; + call->acks_latest_ts = ktime_get_real(); + call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; - switch (summary.ack_reason) { - case RXRPC_ACK_PING: - break; - default: - if (acked_serial && after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; - break; + if (summary.acked_serial) { + switch (summary.ack_reason) { + case RXRPC_ACK_PING_RESPONSE: + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, summary.ack_serial, + rxrpc_rtt_rx_ping_response); + break; + default: + if (after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; + summary.rtt_sample_avail = true; + break; + } } /* Parse rwind and mtu sizes if provided. */ if (trailer.maxMTU) rxrpc_input_ack_trailer(call, skb, &trailer); - if (first_soft_ack == 0) + if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. 
*/ @@ -978,13 +1151,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - if (before(hard_ack, call->acks_hard_ack) || + if (before(hard_ack, call->tx_bottom) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); - if (after(hard_ack, call->acks_hard_ack)) { + if (after(hard_ack, call->tx_bottom)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; @@ -994,25 +1167,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); - rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); - call->cong_last_nack = skb; + rxrpc_input_soft_acks(call, &summary, skb); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - summary.nr_acks == call->tx_top - hard_ack && + call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); - rxrpc_congestion_management(call, skb, &summary, acked_serial); + /* Drive the congestion management algorithm first and then RACK-TLP as + * the latter depends on the state/change in state in the former. + */ + rxrpc_congestion_management(call, &summary); + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + rxrpc_tlp_process_ack(call, &summary); + if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial)) + call->tlp_serial = 0; send_response: if (summary.ack_reason == RXRPC_ACK_PING) - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial, rxrpc_propose_ack_respond_to_ack); } @@ -1111,5 +1289,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) break; } - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); } diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c new file mode 100644 index 000000000000..13c371261e0a --- /dev/null +++ b/net/rxrpc/input_rack.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RACK-TLP [RFC8985] Implementation + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, + ktime_t t2, rxrpc_seq_t seq2) +{ + if (ktime_after(t1, t2)) + return true; + return t1 == t2 && after(seq1, seq2); +} + +/* + * Mark a packet lost. + */ +static void rxrpc_rack_mark_lost(struct rxrpc_call *call, + struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (__test_and_set_bit(ix, &tq->segment_lost)) { + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + } else { + call->tx_nr_lost++; + } + tq->segment_xmit_ts[ix] = UINT_MAX; +} + +/* + * Get the transmission time of a packet in the Tx queue.
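+ * A segment that has been marked lost has its timestamp set to UINT_MAX + * by rxrpc_rack_mark_lost(), which is mapped back to KTIME_MAX here.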
+ */ +static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (tq->segment_xmit_ts[ix] == UINT_MAX) + return KTIME_MAX; + return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); +} + +/* + * Get a bitmask of nack bits for a queue segment and mask off any that aren't + * yet reported. + */ +static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) +{ + unsigned long nacks = ~tq->segment_acked; + + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + nacks &= (1UL << tq->nr_reported_acks) - 1; + return nacks; +} + +/* + * Update the RACK state for the most recently sent packet that has been + * delivered [RFC8985 6.2 Step 2]. + */ +static void rxrpc_rack_update(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); + + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + + if (test_bit(ix, &tq->segment_retransmitted)) { + /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ + if (before(call->acks_highest_serial, tq->segment_serial[ix])) + return; + if (rtt < minmax_get(&call->min_rtt)) + return; + } + + /* The RACK algorithm requires the segment ACKs to be traversed in + * order of segment transmission - but the only thing this seems to + * matter for is that RACK.rtt is set to the rtt of the most recently + * transmitted segment. We should be able to achieve the same by only + * setting RACK.rtt if the xmit time is greater. + */ + if (ktime_after(xmit_ts, call->rack_rtt_ts)) { + call->rack_rtt = rtt; + call->rack_rtt_ts = xmit_ts; + } + + if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) { + call->rack_rtt = rtt; + call->rack_xmit_ts = xmit_ts; + call->rack_end_seq = seq; + } +} + +/* + * Detect data segment reordering [RFC8985 6.2 Step 3]. + */ +static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + + /* Track the highest sequence number so far ACK'd. This is not + * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer + * could put a NACK in the last SACK slot. + */ + if (after(seq, call->rack_fack)) + call->rack_fack = seq; + else if (before(seq, call->rack_fack) && + test_bit(ix, &tq->segment_retransmitted)) + call->rack_reordering_seen = true; +} + +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_rack_update(call, summary, tq, ix); + rxrpc_rack_detect_reordering(call, summary, tq, ix); +} + +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks) +{ + while (new_acks) { + unsigned int ix = __ffs(new_acks); + + __clear_bit(ix, &new_acks); + rxrpc_input_rack_one(call, summary, tq, ix); + } + + trace_rxrpc_rack_update(call, summary); +} + +/* + * Update the reordering window [RFC8985 6.2 Step 4]. Returns the updated + * duration of the reordering window. + * + * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can + * be given a 'DUPLICATE' reason with the serial number referring to the + * duplicated DATA packet.
Rx does not inform as to whether this was a + * reception of the same packet twice or of a retransmission of a packet we + * already received (though this could be determined by the transmitter based + * on the serial number). + */ +static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ + bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; + int dup_thresh = 3; + + /* DSACK-based reordering window adaptation */ + if (!call->rack_dsack_round_none && + after_eq(snd_una, call->rack_dsack_round)) + call->rack_dsack_round_none = true; + + /* Grow the reordering window per round that sees DSACK. Reset the + * window after 16 DSACK-free recoveries. + */ + if (call->rack_dsack_round_none && have_dsack_option) { + call->rack_dsack_round_none = false; + call->rack_dsack_round = snd_nxt; + call->rack_reo_wnd_mult++; + call->rack_reo_wnd_persist = 16; + } else if (summary->exiting_fast_or_rto_recovery) { + call->rack_reo_wnd_persist--; + if (call->rack_reo_wnd_persist <= 0) + call->rack_reo_wnd_mult = 1; + } + + if (!call->rack_reordering_seen) { + if (summary->in_fast_or_rto_recovery) + return 0; + if (call->acks_nr_sacks >= dup_thresh) + return 0; + } + + return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, + call->srtt_us >> 3)); +} + +/* + * Detect losses [RFC8985 6.2 Step 5]. + */ +static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + struct rxrpc_txqueue *tq; + ktime_t timeout = 0, lost_after, now = ktime_get_real(); + + call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); + lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + trace_rxrpc_rack_scan_loss(call); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long nacks = rxrpc_tq_nacks(tq); + + if (after(tq->qbase, call->tx_transmitted)) + break; + trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); + + /* Skip ones marked lost but not yet retransmitted */ + nacks &= ~tq->segment_lost | tq->segment_retransmitted; + + while (nacks) { + unsigned int ix = __ffs(nacks); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t remaining; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + __clear_bit(ix, &nacks); + + if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq, + xmit_ts, seq)) { + remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); + if (remaining <= 0) { + rxrpc_rack_mark_lost(call, tq, ix); + trace_rxrpc_rack_detect_loss(call, summary, seq); + } else { + timeout = max(remaining, timeout); + } + } + } + } + + return timeout; +} + +/* + * Detect losses and set a timer to retry the detection [RFC8985 6.2 Step 5]. + */ +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + ktime_t timeout = rxrpc_rack_detect_loss(call, summary); + + if (timeout) { + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; + call->rack_timo_at = ktime_add(ktime_get_real(), timeout); + trace_rxrpc_rack_timer(call, timeout, false); + trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); + } +} + +/* + * Handle RACK-TLP RTO expiration [RFC8985 6.3].
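+ * Mark as lost the lowest unacked segment and any unacked segment that + * was transmitted more than RACK.rtt plus the reordering window ago.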
+ */ +static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long unacked = ~tq->segment_acked; + + trace_rxrpc_rack_mark_loss_tq(call, tq); + while (unacked) { + unsigned int ix = __ffs(unacked); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + if (after(seq, call->tx_transmitted)) + return; + __clear_bit(ix, &unacked); + + if (seq == snd_una || + ktime_before(xmit_ts, deadline)) + rxrpc_rack_mark_lost(call, tq, ix); + } + } +} + +/* + * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2]. + */ +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) +{ + unsigned int flight_size = rxrpc_tx_in_flight(call); + ktime_t rto_at = ktime_add(call->tx_last_sent, + rxrpc_get_rto_backoff(call, false)); + ktime_t pto; + + if (call->rtt_count > 0) { + /* Use 2*SRTT as the timeout (srtt_us holds 8x SRTT, so /4 gives 2x SRTT in ns). */ + pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4); + if (flight_size) + pto = ktime_add(pto, call->tlp_max_ack_delay); + } else { + pto = NSEC_PER_SEC; + } + + if (ktime_after(ktime_add(now, pto), rto_at)) + pto = ktime_sub(rto_at, now); + return pto; +} + +/* + * Send a TLP loss probe on PTO expiration [RFC8985 7.3]. + */ +void rxrpc_tlp_send_probe(struct rxrpc_call *call) +{ + unsigned int in_flight = rxrpc_tx_in_flight(call); + + if (after_eq(call->acks_hard_ack, call->tx_transmitted)) + return; /* Everything we transmitted has been acked. */ + + /* There must be no other loss probe still in flight and we need to + * have taken a new RTT sample since last probe or the start of + * connection. + */ + if (!call->tlp_serial && + call->tlp_rtt_taken != call->rtt_taken) { + call->tlp_is_retrans = false; + if (after(call->send_top, call->tx_transmitted) && + rxrpc_tx_window_space(call) > 0) { + /* Transmit the lowest-sequence unsent DATA */ + call->tx_last_serial = 0; + rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data); + call->tlp_serial = call->tx_last_serial; + call->tlp_seq = call->tx_transmitted; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new); + in_flight = rxrpc_tx_in_flight(call); + } else { + /* Retransmit the highest-sequence DATA sent */ + call->tx_last_serial = 0; + rxrpc_resend_tlp(call); + call->tlp_is_retrans = true; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit); + } + } else { + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy); + } + + if (in_flight != 0) { + ktime_t rto = rxrpc_get_rto_backoff(call, false); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; + call->rack_timo_at = ktime_add(ktime_get_real(), rto); + trace_rxrpc_rack_timer(call, rto, false); + trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto); + } +} + +/* + * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
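+ * The probe is resolved if it carried new data that is now hard-ACK'd, if + * the peer reported it as a DUPLICATE, if the hard-ACK point moved past it + * or if the probe segment was soft-ACK'd; otherwise it remains outstanding.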
+ */ +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) +{ + if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack)) + return; + + if (!call->tlp_is_retrans) { + /* TLP of new data delivered */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data); + call->tlp_serial = 0; + } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE && + summary->acked_serial == call->tlp_serial) { + /* General Case: Detected packet losses using RACK [7.4.1] */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked); + call->tlp_serial = 0; + } else if (after(call->acks_hard_ack, call->tlp_seq)) { + /* Repaired the single loss */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond); + call->tlp_serial = 0; + // TODO: Invoke congestion control to react to the loss + // event the probe has repaired + } else if (summary->tlp_probe_acked) { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked); + /* Special Case: Detected a single loss repaired by the loss + * probe [7.4.2] + */ + call->tlp_serial = 0; + } else { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete); + } +} + +/* + * Handle RACK timer expiration; run the detection or probe action appropriate + * to the mode the timer was armed in. + */ +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) +{ + struct rxrpc_ack_summary summary = {}; + enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; + + trace_rxrpc_rack_timer(call, overran_by, true); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + + switch (mode) { + case RXRPC_CALL_RACKTIMER_RACK_REORDER: + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + break; + case RXRPC_CALL_RACKTIMER_TLP_PTO: + rxrpc_tlp_send_probe(call); + break; + case RXRPC_CALL_RACKTIMER_RTO: + // Might need to poke the congestion algo in some way + rxrpc_rack_mark_losses_on_rto(call); + break; + //case RXRPC_CALL_RACKTIMER_ZEROWIN: + default: + pr_warn("Unexpected rack timer %u\n", mode); + } +} diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 6716c021a532..e068f9b79d02 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn, */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp); + return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + txb->pkt_len = txb->len; + if (txb->len == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; return 0; } diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 07c74c77d802..64f8d77b8731 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; - bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, @@ -364,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing.
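+ * If the peer acknowledges the probe serial (or a later one), let the + * PMTUD state machine process the result.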
*/ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; @@ -419,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, peer_srx, skb); } - ret = rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return ret; + return true; } /* @@ -438,6 +443,8 @@ int rxrpc_io_thread(void *data) ktime_t now; #endif bool should_stop; + LIST_HEAD(conn_attend_q); + LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); @@ -448,43 +455,26 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); - /* Deal with connections that want immediate attention. */ - conn = list_first_entry_or_null(&local->conn_attend_q, - struct rxrpc_connection, - attend_link); - if (conn) { - spin_lock_bh(&local->lock); - list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); - - rxrpc_input_conn_event(conn, NULL); - rxrpc_put_connection(conn, rxrpc_conn_put_poke); - continue; + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); } +#endif - if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, - &local->client_conn_flags)) - rxrpc_discard_expired_client_conns(local); - - /* Deal with calls that want immediate attention. */ - if ((call = list_first_entry_or_null(&local->call_attend_q, - struct rxrpc_call, - attend_link))) { - spin_lock_bh(&local->lock); - list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); - - trace_rxrpc_call_poked(call); - rxrpc_input_call_event(call, NULL); - rxrpc_put_call(call, rxrpc_call_put_poke); - continue; + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); + trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } - if (!list_empty(&local->new_client_calls)) - rxrpc_connect_client_calls(local); - - /* Process received packets and errors. */ - if ((skb = __skb_dequeue(&rx_queue))) { + /* Distribute packets and errors. */ + while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: @@ -508,27 +498,46 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } - continue; } - /* Inject a delay into packets if requested. */ -#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY - now = ktime_get_real(); - while ((skb = skb_peek(&local->rx_delay_queue))) { - if (ktime_before(now, skb->tstamp)) - break; - skb = skb_dequeue(&local->rx_delay_queue); - skb_queue_tail(&local->rx_queue, skb); + /* Deal with connections that want immediate attention. 
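+ * The pending connections are spliced out in one go under the lock and + * then unlinked and processed one at a time.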
*/ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + + while ((conn = list_first_entry_or_null(&conn_attend_q, + struct rxrpc_connection, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_irq(&local->lock); + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); } -#endif - if (!skb_queue_empty(&local->rx_queue)) { - spin_lock_irq(&local->rx_queue.lock); - skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); - spin_unlock_irq(&local->rx_queue.lock); - continue; + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + + /* Deal with calls that want immediate attention. */ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->call_attend_q, &call_attend_q); + spin_unlock_irq(&local->lock); + + while ((call = list_first_entry_or_null(&call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_irq(&local->lock); + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call); + rxrpc_put_call(call, rxrpc_call_put_poke); } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || @@ -558,7 +567,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 2792d2304605..a74a4b43904f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); - - /* We want receive timestamps. */ - sock_enable_timestamps(usk); break; default: diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 657cf35089a6..8fcc8139d771 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5ea9601efd05..6f7a125d6e90 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) } /* + * Allocate transmission buffers for an ACK and attach them to local->kv[]. + */ +static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct kvec *kv = call->local->kvec; + gfp_t gfp = rcu_read_lock_held() ? 
GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) + return -ENOMEM; + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + return -ENOMEM; + } + } + + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + kv[0].iov_base = whdr; + kv[0].iov_len = sizeof(*whdr) + sizeof(*ack); + kv[1].iov_base = buf2; + kv[1].iov_len = sack_size; + kv[2].iov_base = filler; + kv[2].iov_len = 3 + sizeof(*trailer); + return 3; /* Number of kvec[] used. */ +} + +static void rxrpc_free_ack(struct rxrpc_call *call) +{ + page_frag_free(call->local->kvec[0].iov_base); + if (call->local->kvec[1].iov_base) + page_frag_free(call->local->kvec[1].iov_base); +} + +/* + * Record the beginning of an RTT probe. + */ +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) +{ + unsigned long avail = call->rtt_avail; + int rtt_slot = 9; + + if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) + goto no_slot; + + rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); + if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) + goto no_slot; + + call->rtt_serial[rtt_slot] = serial; + call->rtt_sent_at[rtt_slot] = now; + smp_wmb(); /* Write data before avail bit */ + set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + + trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); + return; + +no_slot: + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); +} + +/* * Fill out an ACK packet. */ -static void rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u8 ack_reason, - rxrpc_serial_t serial) +static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, + rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); - unsigned int qsize, sack, wrap, to; + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; + ktime_t now = ktime_get_real(); int rsize; - u32 mtu, jmax; - u8 *filler = txb->kvec[2].iov_base; - u8 *sackp = txb->kvec[1].iov_base; + u8 *filler = kv[2].iov_base; + u8 *sackp = kv[1].iov_base; rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); @@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, wtop = call->ackr_wtop; sack = call->ackr_sack_base % RXRPC_SACK_SIZE; + *_ack_serial = rxrpc_get_next_serial(call->conn); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->serial = htonl(*_ack_serial); whdr->seq = 0; whdr->type = RXRPC_PACKET_TYPE_ACK; - txb->flags |= RXRPC_SLOW_START_OK; + whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + ack->bufferSpace = 0; ack->maxSkew = 0; ack->firstPacket = htonl(window); ack->previousPacket = htonl(call->rx_highest_seq); - ack->serial = 
htonl(serial); + ack->serial = htonl(serial_to_ack); ack->reason = ack_reason; ack->nAcks = wtop - window; filler[0] = 0; @@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, filler[2] = 0; if (ack_reason == RXRPC_ACK_PING) - txb->flags |= RXRPC_REQUEST_ACK; + whdr->flags |= RXRPC_REQUEST_ACK; if (after(wtop, window)) { - txb->len += ack->nAcks; - txb->kvec[1].iov_base = sackp; - txb->kvec[1].iov_len = ack->nAcks; + kv[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); + to = umin(ack->nAcks, RXRPC_SACK_SIZE); if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); @@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, ack->reason = RXRPC_ACK_IDLE; } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); - txb->ack_rwind = rsize; - trailer->maxMTU = htonl(rxrpc_rx_mtu); - trailer->ifMTU = htonl(mtu); - trailer->rwind = htonl(rsize); - trailer->jumbo_max = htonl(jmax); -} - -/* - * Record the beginning of an RTT probe. - */ -static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - ktime_t now, enum rxrpc_rtt_tx_trace why) -{ - unsigned long avail = call->rtt_avail; - int rtt_slot = 9; - - if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) - goto no_slot; - - rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); - if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) - goto no_slot; - call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = now; - smp_wmb(); /* Write data before avail bit */ - set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; + } - trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return; + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); + trailer->rwind = htonl(rsize); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ -no_slot: - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); + if (ack_reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); + if (whdr->flags & RXRPC_REQUEST_ACK) + call->rtt_last_req = now; + rxrpc_set_keepalive(call, now); + return nr_kv; } /* * Transmit an ACK packet. 
*/ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - ktime_t now; int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - - txb->serial = rxrpc_get_next_serial(conn); - whdr->serial = htonl(txb->serial); - trace_rxrpc_tx_ack(call->debug_id, txb->serial, + trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(ack->firstPacket), ntohl(ack->serial), ack->reason, ack->nAcks, - txb->ack_rwind); + ntohl(trailer->rwind), why); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); - rxrpc_local_dont_fragment(conn->local, false); - ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { - trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, + trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - now = ktime_get_real(); - if (ack->reason == RXRPC_ACK_PING) - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping); - if (txb->flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = now; - rxrpc_set_keepalive(call, now); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); } @@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * Queue an ACK for immediate transmission. */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) + rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why) { - struct rxrpc_txbuf *txb; + struct kvec *kv = call->local->kvec; + rxrpc_serial_t ack_serial; + size_t len; + int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window); - if (!txb) { + nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window); + if (nr_kv < 0) { kleave(" = -ENOMEM"); return; } - txb->ack_why = why; + nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial); + len = kv[0].iov_len; + len += kv[1].iov_len; + len += kv[2].iov_len; + + /* Extend a path MTU probe ACK. 
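+ * Pad the packet out to the trial size with zero pages so that we can + * discover whether a packet that large will traverse the path intact.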
*/ + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (len > probe_mtu) + goto skip; + while (len < probe_mtu) { + size_t part = umin(probe_mtu - len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + len += part; + nr_kv++; + } + } - rxrpc_fill_out_ack(call, txb, ack_reason, serial); call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + trace_rxrpc_send_ack(call, why, ack_reason, ack_serial); + rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why); +skip: + rxrpc_free_ack(call); +} + +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); } /* @@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. */ -static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, + rxrpc_serial_t serial, int subpkt) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + struct kvec *kv = &call->local->kvec[subpkt]; + size_t len = txb->pkt_len; + bool last; + u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); txb->serial = serial; @@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t txb->seq == 1) whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; + + if (subpkt < req->n - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -346,113 +463,188 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. 
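+ * + * An ACK is otherwise requested after an apparent loss, for a retransmission, + * in early slow start, when the Tx window or RTT sample set is small, when + * the RTT data is stale or when the app appears to have stalled.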
*/ - if (txb->flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && txb->seq & 1) + else if (call->rtt_count < 3) why = rxrpc_reqack_more_rtt; - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) - txb->flags |= RXRPC_REQUEST_ACK; + if (why != rxrpc_reqack_no_srv_last) { + flags |= RXRPC_REQUEST_ACK; + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); + call->rtt_last_req = req->now; + } dont_set_request_ack: - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - whdr->serial = htonl(txb->serial); - whdr->cksum = txb->cksum; + /* The jumbo header overlays the wire header in the txbuf. */ + if (subpkt < req->n - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->serial = htonl(txb->serial); + whdr->cksum = txb->cksum; + whdr->serviceId = htons(conn->service_id); + kv->iov_base = whdr; + len += sizeof(*whdr); + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace); + kv->iov_len = len; + return len; } /* - * Prepare a packet for transmission. + * Prepare a transmission queue object for initial transmission. Returns the + * number of microseconds since the transmission queue base timestamp. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, + struct rxrpc_send_data_req *req) { - rxrpc_serial_t serial; - - /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(call->conn); - - rxrpc_prepare_data_subpacket(call, txb, serial); - - return txb->len; + if (!tq) + return 0; + if (tq->xmit_ts_base == KTIME_MIN) { + tq->xmit_ts_base = req->now; + return 0; + } + return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base)); } /* - * Set timeouts after transmitting a packet. + * Prepare a (jumbo) packet for transmission. 
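+ * Consecutive serial numbers are assigned to the subpackets and each segment + * is timestamped for RACK; transmitting new data also arms the TLP probe + * timer.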
*/ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { - ktime_t now = ktime_get_real(); - bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + struct rxrpc_txqueue *tq = req->tq; + rxrpc_serial_t serial; + unsigned int xmit_ts; + rxrpc_seq_t seq = req->seq; + size_t len = 0; + bool start_tlp = false; - call->tx_last_sent = now; - txb->last_sent = now; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); - if (ack_requested) { - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + /* Each transmission of a Tx packet needs a new serial number */ + serial = rxrpc_get_next_serials(call->conn, req->n); + + call->tx_last_serial = serial + req->n - 1; + call->tx_last_sent = req->now; + xmit_ts = rxrpc_prepare_txqueue(tq, req); + prefetch(tq->next); + + for (int i = 0;;) { + int ix = seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + + /* Record (re-)transmission for RACK [RFC8985 6.1]. */ + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (req->retrans) { + __set_bit(ix, &tq->ever_retransmitted); + __set_bit(ix, &tq->segment_retransmitted); + call->tx_nr_resent++; + } else { + call->tx_nr_sent++; + start_tlp = true; + } + tq->segment_xmit_ts[ix] = xmit_ts; + tq->segment_serial[ix] = serial; + if (i + 1 == req->n) + /* Only sample the last subpacket in a jumbo. */ + __set_bit(ix, &tq->rtt_samples); + len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); + serial++; + seq++; + i++; + if (i >= req->n) + break; + if (!(seq & RXRPC_TXQ_MASK)) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance); + xmit_ts = rxrpc_prepare_txqueue(tq, req); + } + } - call->peer->rtt_last_req = now; - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + /* Set timeouts */ + if (req->tlp_probe) { + /* Sending TLP loss probe [RFC8985 7.3]. */ + call->tlp_serial = serial - 1; + call->tlp_seq = seq - 1; + } else if (start_tlp) { + /* Schedule TLP loss probe [RFC8985 7.2]. */ + ktime_t pto; + + if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + /* The first packet may take longer to elicit a response. 
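+ * Allow it a full second before sending a loss probe.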
 */
+ pto = NSEC_PER_SEC;
+ else
+ pto = rxrpc_tlp_calc_pto(call, req->now);
 
- call->ack_lost_at = ktime_add(now, delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO;
+ call->rack_timo_at = ktime_add(req->now, pto);
+ trace_rxrpc_rack_timer(call, pto, false);
+ trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto);
 }
 
 if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) {
 ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo));
 
- call->expect_rx_by = ktime_add(now, delay);
+ call->expect_rx_by = ktime_add(req->now, delay);
 trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
 }
 
- rxrpc_set_keepalive(call, now);
+ rxrpc_set_keepalive(call, req->now);
+ return len;
 }
 
 /*
- * send a packet through the transport endpoint
+ * Send one or more packets through the transport endpoint
 */
-static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req)
 {
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
 struct rxrpc_connection *conn = call->conn;
 enum rxrpc_tx_point frag;
+ struct rxrpc_txqueue *tq = req->tq;
+ struct rxrpc_txbuf *txb;
 struct msghdr msg;
+ rxrpc_seq_t seq = req->seq;
 size_t len;
- int ret;
+ bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags);
+ int ret, stat_ix;
 
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1);
 
- len = rxrpc_prepare_data_packet(call, txb);
+ stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]);
 
- if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
- static int lose;
- if ((lose++ & 7) == 7) {
- ret = 0;
- trace_rxrpc_tx_data(call, txb->seq, txb->serial,
- txb->flags, true);
- goto done;
- }
- }
+ len = rxrpc_prepare_data_packet(call, req);
+ txb = tq->bufs[seq & RXRPC_TXQ_MASK];
 
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len);
 
 msg.msg_name = &call->peer->srx.transport;
 msg.msg_namelen = call->peer->srx.transport_len;
@@ -460,16 +652,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 msg.msg_controllen = 0;
 msg.msg_flags = MSG_SPLICE_PAGES;
 
- /* Track what we've attempted to transmit at least once so that the
- * retransmission algorithm doesn't try to resend what we haven't sent
- * yet.
+ /* Send the packet with the don't fragment bit set unless we think it's
+ * too big or this is a retransmission.
 */
- if (txb->seq == call->tx_transmitted + 1)
- call->tx_transmitted = txb->seq;
-
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (txb->len >= call->peer->maxdata) {
+ if (seq == call->tx_transmitted + 1 &&
+ len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) {
 rxrpc_local_dont_fragment(conn->local, false);
 frag = rxrpc_tx_point_call_data_frag;
 } else {
@@ -477,7 +664,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
 frag = rxrpc_tx_point_call_data_nofrag;
 }
 
-retry:
+ /* Track what we've attempted to transmit at least once so that the
+ * retransmission algorithm doesn't try to resend what we haven't sent
+ * yet. 
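+ * A jumbo transmission advances tx_transmitted past all req->n
+ * subpackets in one step.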
+ */
+ if (seq == call->tx_transmitted + 1)
+ call->tx_transmitted = seq + req->n - 1;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags,
+ rxrpc_txdata_inject_loss);
+ conn->peer->last_tx_at = ktime_get_seconds();
+ goto done;
+ }
+ }
+
 /* send the packet by UDP
 * - returns -EMSGSIZE if UDP would have to fragment the packet
 * to go out of the interface
@@ -488,36 +693,35 @@ retry:
 ret = do_udp_sendmsg(conn->local->socket, &msg, len);
 conn->peer->last_tx_at = ktime_get_seconds();
 
- if (ret < 0) {
+ if (ret == -EMSGSIZE) {
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize);
+ trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag);
+ ret = 0;
+ } else if (ret < 0) {
 rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
 trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag);
 } else {
- trace_rxrpc_tx_packet(call->debug_id, whdr, frag);
+ trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag);
 }
 rxrpc_tx_backoff(call, ret);
 
- if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) {
- rxrpc_local_dont_fragment(conn->local, false);
- frag = rxrpc_tx_point_call_data_frag;
- goto retry;
- }
-done:
- if (ret >= 0) {
- rxrpc_tstamp_data_packets(call, txb);
- } else {
- /* Cancel the call if the initial transmission fails,
- * particularly if that's due to network routing issues that
- * aren't going away anytime soon. The layer above can arrange
- * the retransmission.
+ if (ret < 0) {
+ /* Cancel the call if the initial transmission fails or if we
+ * hit an error due to network routing issues that aren't going
+ * away anytime soon. The layer above can arrange the
+ * retransmission.
 */
- if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ if (new_call ||
+ ret == -ENETUNREACH ||
+ ret == -EHOSTUNREACH ||
+ ret == -ECONNREFUSED)
 rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
 RX_USER_ABORT, ret);
 }
 
- _leave(" = %d [%u]", ret, call->peer->maxdata);
- return ret;
+done:
+ _leave(" = %d [%u]", ret, call->peer->max_data);
 }
 
 /*
@@ -692,41 +896,3 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
 peer->last_tx_at = ktime_get_seconds();
 _leave("");
 }
-
-/*
- * Schedule an instant Tx resend.
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb)
-{
- if (!__rxrpc_call_is_complete(call))
- kdebug("resend");
-}
-
-/*
- * Transmit one packet. 
- */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) -{ - int ret; - - ret = rxrpc_send_data_packet(call, txb); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - break; - default: - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, txb); - } - } else { - ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - - call->resend_at = ktime_add(ktime_get_real(), delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx); - } -} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c..d82e44a3901b 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,17 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); } } @@ -205,23 +213,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, struct rxrpc_call *call; HLIST_HEAD(error_targets); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); hlist_move_list(&peer->error_targets, &error_targets); while (!hlist_empty(&error_targets)) { call = hlist_entry(error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); } - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); } /* @@ -347,3 +355,89 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. + */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. 
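+ * Promote the trial size to the known-good floor; max_data can
+ * be raised immediately if the probe exceeded it.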
*/ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) { + preempt_disable(); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); + preempt_enable(); + } + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d5..e1c63129586b 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -222,11 +235,9 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - spin_lock_init(&peer->rtt_input_lock); + seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - - rxrpc_peer_init_rtt(peer); - + peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } @@ -242,9 +253,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +277,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); } /* @@ -304,6 +315,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * Set up a new incoming peer. 
There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. + * Called with interrupts disabled. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { @@ -479,7 +491,7 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; + return READ_ONCE(peer->recent_srtt_us); } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 263a2251e3d2..d803562ca0ac 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - rxrpc_seq_t acks_hard_ack; + rxrpc_seq_t tx_bottom; char lbuff[50], rbuff[50]; long timeout = 0; @@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (state != RXRPC_CALL_SERVER_PREALLOC) timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); - acks_hard_ack = READ_ONCE(call->acks_hard_ack); + tx_bottom = READ_ONCE(call->tx_bottom); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[state], call->abort_code, call->debug_id, - acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, + tx_bottom, READ_ONCE(call->tx_top) - tx_bottom, call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, @@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, - peer->srtt_us >> 3, - peer->rto_us); + READ_ONCE(peer->recent_srtt_us), + READ_ONCE(peer->recent_rto_us)); return 0; } @@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u fail=%u\n", + "Data : send=%u sendf=%u fail=%u emsz=%u\n", atomic_read(&rxnet->stat_tx_data_send), atomic_read(&rxnet->stat_tx_data_send_frag), - atomic_read(&rxnet->stat_tx_data_send_fail)); + atomic_read(&rxnet->stat_tx_data_send_fail), + atomic_read(&rxnet->stat_tx_data_send_msgsize)); seq_printf(seq, "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), @@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), 
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), @@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, + "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_tx_jumbo[0]), + atomic_read(&rxnet->stat_tx_jumbo[1]), + atomic_read(&rxnet->stat_tx_jumbo[2]), + atomic_read(&rxnet->stat_tx_jumbo[3]), + atomic_read(&rxnet->stat_tx_jumbo[4]), + atomic_read(&rxnet->stat_tx_jumbo[5]), + atomic_read(&rxnet->stat_tx_jumbo[6]), + atomic_read(&rxnet->stat_tx_jumbo[7]), + atomic_read(&rxnet->stat_tx_jumbo[8]), + atomic_read(&rxnet->stat_tx_jumbo[9])); + seq_printf(seq, + "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_rx_jumbo[0]), + atomic_read(&rxnet->stat_rx_jumbo[1]), + atomic_read(&rxnet->stat_rx_jumbo[2]), + atomic_read(&rxnet->stat_rx_jumbo[3]), + atomic_read(&rxnet->stat_rx_jumbo[4]), + atomic_read(&rxnet->stat_rx_jumbo[5]), + atomic_read(&rxnet->stat_rx_jumbo[6]), + atomic_read(&rxnet->stat_rx_jumbo[7]), + atomic_read(&rxnet->stat_rx_jumbo[8]), + atomic_read(&rxnet->stat_rx_jumbo[9])); + seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_rx_skbs)); @@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo)); + memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4fe6b4d20ada..42f70e4636f8 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. 
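+ *
+ * As a worked example, RXRPC_JUMBO(2) below comes to 28 + 1412 + 1416
+ * = 2856 bytes: one wire header, the first 1412-byte data segment and
+ * one further subpacket (4-byte jumbo header plus 1412 bytes of data).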
*/ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a482f88c5fc5..32cd5f1d541d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } else { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -337,14 +337,14 @@ try_again: * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && skb_queue_empty(&call->recvmsg_queue)) { list_del_init(&call->recvmsg_link); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -355,7 +355,7 @@ try_again: list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -445,9 +445,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index cdab7b7d08a0..7474f88d7b18 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -12,22 +12,22 @@ #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) -#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } -static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { - return (peer->srtt_us >> 3) + peer->rttvar_us; + return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* @@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto) * To save cycles in the RFC 1323 implementation it was better to break * it up 
into three procedures. -- erics */ -static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ - u32 srtt = peer->srtt_us; + u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) if (m > 0) m >>= 3; } else { - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ } - peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (peer->mdev_us > peer->mdev_max_us) { - peer->mdev_max_us = peer->mdev_us; - if (peer->mdev_max_us > peer->rttvar_us) - peer->rttvar_us = peer->mdev_max_us; + call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (call->mdev_us > call->mdev_max_us) { + call->mdev_max_us = call->mdev_us; + if (call->mdev_max_us > call->rttvar_us) + call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); - peer->mdev_max_us = peer->rttvar_us; + call->mdev_us = m << 1; /* make sure rto = 3*rtt */ + call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); + call->mdev_max_us = call->rttvar_us; } - peer->srtt_us = max(1U, srtt); + call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void rxrpc_set_rto(struct rxrpc_peer *peer) +static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; @@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - rto = __rxrpc_set_rto(peer); + rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_us = rxrpc_bound_rto(rto); + call->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) +{ + /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ + u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; + + minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, + (u32)rtt_us ? : jiffies_to_usecs(1)); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; - //rxrpc_update_rtt_min(peer, rtt_us); - rxrpc_rtt_estimator(peer, rtt_us); - rxrpc_set_rto(peer); + /* Update RACK min RTT [RFC8985 6.1 Step 1]. 
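+ * A windowed running-min is used so that stale low samples
+ * age out rather than pinning min_rtt down permanently.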
*/ + rxrpc_update_rtt_min(call, resp_time, rtt_us); + + rxrpc_rtt_estimator(call, rtt_us); + rxrpc_set_rto(call); - /* RFC6298: only reset backoff on valid RTT measurement. */ - peer->backoff = 0; + /* Only reset backoff on valid RTT measurement [RFC6298]. */ + call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. + * exclusive access to the call RTT data. */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { - struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; - spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, rtt_us); - if (peer->rtt_count < 3) - peer->rtt_count++; - spin_unlock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(call, resp_time, rtt_us); + if (call->rtt_count < 3) + call->rtt_count++; + call->rtt_taken++; + + WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); + WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_us); + rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; - u32 backoff = READ_ONCE(peer->backoff); + u32 backoff = READ_ONCE(call->backoff); - timo_us = peer->rto_us; + timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) - WRITE_ONCE(peer->backoff, backoff + 1); + WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; @@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) return ns_to_ktime(timo_us * NSEC_PER_USEC); } -void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +void rxrpc_call_init_rtt(struct rxrpc_call *call) { - peer->rto_us = RXRPC_TIMEOUT_INIT; - peer->mdev_us = RXRPC_TIMEOUT_INIT; - peer->backoff = 0; - //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); + call->rtt_last_req = KTIME_MIN; + call->rto_us = RXRPC_TIMEOUT_INIT; + call->mdev_us = RXRPC_TIMEOUT_INIT; + call->backoff = 0; + //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 48a1475e6b06..62b09d23ec08 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,14 +148,14 @@ error: static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { struct rxrpc_txbuf *txb; - size_t shdr, space; + size_t shdr, alloc, limit, part; - remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 1, gfp); + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -164,15 +164,21 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); - 
space = round_up(space, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); if (!txb) return NULL; txb->offset += shdr; - txb->space -= shdr; + txb->space = part; return txb; } @@ -263,13 +269,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; + txb->pkt_len += pad; } /* start the encryption afresh */ @@ -298,7 +304,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; @@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; - } /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, rxkhdr, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -384,19 +387,33 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = whdr + 1; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d310..7ef93407be83 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -503,7 +503,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, 
ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cb8dd1d3b1d4..9784adc8f275 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -114,10 +114,10 @@ found: if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6abb8eec1b2b..0e8da909d4f2 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -94,9 +94,11 @@ no_wait: */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { + rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); + if (_tx_win) - *_tx_win = call->tx_bottom; - return call->tx_prepared - call->tx_bottom < 256; + *_tx_win = tx_bottom; + return call->send_top - tx_bottom < 256; } /* @@ -132,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; - rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; - tx_start = smp_load_acquire(&call->acks_hard_ack); + tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -195,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u,%u}", - call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u}", + call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); @@ -240,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { + struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; - + int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); - - /* We have to set the timestamp before queueing as the retransmit - * algorithm can see the packet as soon as we queue it. - */ - txb->last_sent = ktime_get_real(); + ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + if (WARN_ON_ONCE(sq->bufs[ix])) + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); + else + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); + /* Add the packet to the call's output buffer */ - spin_lock(&call->tx_lock); - poke = list_empty(&call->tx_sendmsg); - list_add_tail(&txb->call_link, &call->tx_sendmsg); - call->tx_prepared = seq; - if (last) + poke = (READ_ONCE(call->tx_bottom) == call->send_top); + sq->bufs[ix] = txb; + /* Order send_top after the queue->next pointer and txb content. */ + smp_store_release(&call->send_top, seq); + if (last) { + set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); - spin_unlock(&call->tx_lock); + call->send_queue = NULL; + } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* + * Allocate a new txqueue unit and add it to the transmission queue. 
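+ * Each unit carries RXRPC_NR_TXQUEUE txbuf slots together with the
+ * per-segment metadata (transmit timestamps, serial numbers and the
+ * acked/lost/retransmitted bitmaps) consumed by RACK.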
+ */ +static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + + tq = kzalloc(sizeof(*tq), sk->sk_allocation); + if (!tq) + return -ENOMEM; + + tq->xmit_ts_base = KTIME_MIN; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + tq->segment_xmit_ts[i] = UINT_MAX; + + if (call->send_queue) { + tq->qbase = call->send_top + 1; + call->send_queue->next = tq; + call->send_queue = tq; + } else if (WARN_ON(call->tx_queue)) { + kfree(tq); + return -ENOMEM; + } else { + /* We start at seq 1, so pretend seq 0 is hard-acked. */ + tq->nr_reported_acks = 1; + tq->segment_acked = 1UL; + tq->qbase = 0; + call->tx_qbase = 0; + call->send_queue = tq; + call->tx_qtail = tq; + call->tx_queue = tq; + } + + trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); + return 0; +} + +/* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. @@ -288,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; + if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); + return -EPROTO; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); @@ -344,6 +393,13 @@ reload: if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; + /* See if we need to begin/extend the Tx queue. */ + if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { + ret = rxrpc_alloc_txqueue(sk, call); + if (ret < 0) + goto maybe_error; + } + /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. 
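The sendmsg hunk above grows the transmission queue lazily: a new rxrpc_txqueue unit is allocated only when the next sequence number would land in slot 0 of a not-yet-allocated unit. A minimal sketch of the slot arithmetic follows; it is illustrative only, the EX_* names are hypothetical, and it assumes the unit size is a power of two (64 would match the unsigned long segment bitmaps seen above):

    #include <stdbool.h>

    #define EX_NR_TXQUEUE 64                  /* assumed unit size */
    #define EX_TXQ_MASK   (EX_NR_TXQUEUE - 1)

    /* Slot a DATA sequence number occupies within its txqueue unit. */
    static inline unsigned int ex_txq_slot(unsigned int seq)
    {
            return seq & EX_TXQ_MASK;
    }

    /* True when the packet after send_top needs a freshly allocated unit. */
    static inline bool ex_txq_needs_alloc(unsigned int send_top)
    {
            return ex_txq_slot(send_top + 1) == 0;
    }

This mirrors the !((call->send_top + 1) & RXRPC_TXQ_MASK) test in rxrpc_send_data(): the initial unit pretends seq 0 is already hard-acked, so the first real packet (seq 1) occupies slot 1, and a further unit is chained on each time the sequence space crosses a unit boundary.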
@@ -360,7 +416,7 @@ reload: /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, @@ -385,16 +441,12 @@ reload: (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; txb->kvec[0].iov_len += txb->len; - txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 9bf9a1f6e4cb..46a20cf4c402 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,6 +11,8 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; @@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c3913d8a50d3..131d9e55c8e9 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -24,7 +24,7 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ size_t total, hoff; void *buf; - txb = kmalloc(sizeof(*txb), gfp); + txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; @@ -43,20 +43,14 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ whdr = buf + hoff; - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); - txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; txb->space = data_size; - txb->len = 0; txb->offset = sizeof(*whdr); txb->flags = call->conn->out_clientflag; - txb->ack_why = 0; - txb->seq = call->tx_prepared + 1; - txb->serial = 0; - txb->cksum = 0; + txb->seq = call->send_top + 1; txb->nr_kvec = 1; txb->kvec[0].iov_base = whdr; txb->kvec[0].iov_len = sizeof(*whdr); @@ -79,84 +73,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -/* - * Allocate and partially initialise an ACK packet. - */ -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_acktrailer *trailer; - struct rxrpc_ackpacket *ack; - struct rxrpc_txbuf *txb; - gfp_t gfp = rcu_read_lock_held() ? 
GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; - void *buf, *buf2 = NULL; - u8 *filler; - - txb = kmalloc(sizeof(*txb), gfp); - if (!txb) - return NULL; - - buf = page_frag_alloc(&call->local->tx_alloc, - sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); - if (!buf) { - kfree(txb); - return NULL; - } - - if (sack_size) { - buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); - if (!buf2) { - page_frag_free(buf); - kfree(txb); - return NULL; - } - } - - whdr = buf; - ack = buf + sizeof(*whdr); - filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; - trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; - - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = 0; - txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer); - txb->offset = 0; - txb->flags = call->conn->out_clientflag; - txb->ack_rwind = 0; - txb->seq = 0; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 3; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack); - txb->kvec[1].iov_base = buf2; - txb->kvec[1].iov_len = sack_size; - txb->kvec[2].iov_base = filler; - txb->kvec[2].iov_len = 3 + sizeof(*trailer); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = 0; - whdr->type = RXRPC_PACKET_TYPE_ACK; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); - - get_page(virt_to_head_page(trailer)); - - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); - return txb; -} - void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r; @@ -179,7 +95,8 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base) + if (txb->kvec[i].iov_base && + !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) page_frag_free(txb->kvec[i].iov_base); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); @@ -202,37 +119,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) rxrpc_free_txbuf(txb); } } - -/* - * Shrink the transmit buffer. 
- */
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
-{
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
- bool wake = false;
-
- _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
-
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- hard_ack = smp_load_acquire(&call->acks_hard_ack);
- if (before(hard_ack, txb->seq))
- break;
-
- if (txb->seq != call->tx_bottom + 1)
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step);
- ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
- smp_store_release(&call->tx_bottom, call->tx_bottom + 1);
- list_del_rcu(&txb->call_link);
-
- trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
-
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
- if (after(call->acks_hard_ack, call->tx_bottom + 128))
- wake = true;
- }
-
- if (wake)
- wake_up(&call->waitq);
-}
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 8d8b2db4653c..deb0925f536d 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -484,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars,
 /* Call this with a freshly dequeued packet for possible congestion marking.
- * Returns true as an instruction to drop the packet, false for delivery.
+ * Returns a drop reason as an instruction to drop the packet, or
+ * SKB_NOT_DROPPED_YET for delivery.
 */
-static bool cobalt_should_drop(struct cobalt_vars *vars,
- struct cobalt_params *p,
- ktime_t now,
- struct sk_buff *skb,
- u32 bulk_flows)
+static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
 {
- bool next_due, over_target, drop = false;
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+ bool next_due, over_target;
 ktime_t schedule;
 u64 sojourn;
@@ -533,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
 
 if (next_due && vars->dropping) {
 /* Use ECN mark if possible, otherwise drop */
- drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+ if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
 
 vars->count++;
 if (!vars->count)
@@ -556,16 +558,17 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
 }
 
 /* Simple BLUE implementation. Lack of ECN is deliberate. 
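+ * (BLUE exists to shed load from flows that do not respond to
+ * COBALT's AQM signals, so an ECN mark would achieve nothing here.)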
*/ - if (vars->p_drop) - drop |= (get_random_u32() < vars->p_drop); + if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + get_random_u32() < vars->p_drop) + reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && !drop) + else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; - return drop; + return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, @@ -1528,12 +1531,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) flow->dropped++; b->tin_dropped++; - sch->qstats.drops++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - __qdisc_drop(skb, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, len); @@ -1926,7 +1928,7 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin) q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) @@ -1934,6 +1936,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; struct cake_host *srchost, *dsthost; + enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; @@ -2143,12 +2146,12 @@ retry: goto begin; } + reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, - (b->bulk_flow_count * - !!(q->rate_flags & - CAKE_FLAG_INGRESS))) || - !flow->head) + if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ @@ -2162,7 +2165,7 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb(skb); + kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 3e8d4fe4d91e..81189d02fee7 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -89,7 +89,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a5e87f9ea986..2ca5332cfcc5 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -537,6 +537,8 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } +#define FQDR(reason) SKB_DROP_REASON_FQ_##reason + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -548,7 +550,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, 
skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(BAND_LIMIT)); } now = ktime_get_ns(); @@ -558,8 +561,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q, now)) { if (q->horizon_drop) { - q->stat_horizon_drops++; - return qdisc_drop(skb, sch, to_free); + q->stat_horizon_drops++; + return qdisc_drop_reason(skb, sch, to_free, + FQDR(HORIZON_LIMIT)); } q->stat_horizon_caps++; skb->tstamp = now + q->horizon; @@ -572,7 +576,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (f != &q->internal) { if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(FLOW_LIMIT)); } if (fq_flow_is_detached(f)) { @@ -597,6 +602,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } +#undef FQDR static void fq_check_throttled(struct fq_sched_data *q, u64 now) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4f908c11ba95..799f5397ad4c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -168,6 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -274,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index c38f33ff80bd..93c36afbf576 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,6 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; int ret; @@ -161,6 +162,8 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->overmemory++; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, sel_flow->backlog, skb->len)) { enqueue = true; @@ -198,8 +201,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; sel_flow->vars.accu_prob = 0; - __qdisc_drop(skb, to_free); - qdisc_qstats_drop(sch); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 38ec18f73de4..8874ae668095 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -911,8 +911,8 @@ static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, bands[prio] = q; } - return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, - GFP_KERNEL); + return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, + GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 7d2151c62c4a..ab6234b4fcd5 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -251,10 +251,10 @@ 
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stats.pdrop++; drop: - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); congestion_drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); return NET_XMIT_CN; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b3dcb845b327..bb1fa9aa530b 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -93,6 +94,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; @@ -121,7 +124,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; q->vars.accu_prob = 0; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, reason); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6029bc29b51e..ef8a2afed26b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -70,6 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q) static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; unsigned int len; @@ -107,6 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case RED_HARD_MARK: + reason = SKB_DROP_REASON_QDISC_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { q->stats.forced_drop++; @@ -143,7 +145,7 @@ congestion_drop: if (!skb) return NET_XMIT_CN | ret; - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index b717e15a3a17..d2835f1168e1 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -280,6 +280,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; @@ -380,6 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } r = get_random_u16() & SFB_MAX_PROB; + reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -414,7 +416,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a4b8296a2fa1..65d5b59da583 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -652,6 +652,10 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, if (!p) return -ENOMEM; } + if (ctl->limit == 1) { + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; + } sch_tree_lock(sch); if (ctl->quantum) q->quantum = ctl->quantum; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 
6cc7b846cff1..c370efcfe3e8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1117,7 +1117,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || - smc->clcsock->sk->sk_family != AF_INET || +#if IS_ENABLED(CONFIG_IPV6) + (smc->clcsock->sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || +#endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3b125d348b4a..ccf57b7fe602 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -795,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; + lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; + lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lnk->wr_rx_sge_cnt = 1; + lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 69b54ecd6503..48a1b1dcb576 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -122,10 +122,14 @@ struct smc_link { } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ + int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ + int wr_rx_buflen; /* buffer len for the first sge, len for the + * second sge is lgr shared if rx sge is 2. + */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ @@ -506,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) +{ + return lnk->wr_rx_sge_cnt > 1; +} + static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9c563cdbea90..53828833a3f7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -662,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -676,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = sges_per_buf, + .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 018ce8133b02..f865c58c3aa7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new) + struct smc_link *link_new, + u8 *llc_msg) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); down_write(&lgr->rmbs_lock); @@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc; + smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc; + smc_llc_save_add_link_rkeys(link, link_new, llc_msg); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + if (smc_link_shared_v2_rxbuf(link)) { + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + } else { + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + } llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 994c0cd4fddb..b04a21b8c511 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) return; /* short message */ temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen); hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); @@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk) * the larger spillover buffer, allowing easy data mapping. 
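* When the device cannot provide two receive SGEs per WR (i.e.
 * wr_rx_sge_cnt == 1) there is no shared spillover buffer: each WR owns
 * a full wr_rx_buflen buffer (SMC_WR_BUF_V2_SIZE on a V2 link), which is
 * why the first SGE length below toggles between SMC_WR_TX_SIZE and
 * wr_rx_buflen.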
*/ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int x = i * sges_per_buf; + int x = i * lnk->wr_rx_sge_cnt; lnk->wr_rx_sges[x].addr = - lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen; + lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_TX_SIZE : lnk->wr_rx_buflen; lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; - if (lnk->lgr->smc_version == SMC_V2) { + if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) { lnk->wr_rx_sges[x + 1].addr = lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; lnk->wr_rx_sges[x + 1].length = @@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) } lnk->wr_rx_ibs[i].next = NULL; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; - lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -655,7 +655,7 @@ void smc_wr_free_link(struct smc_link *lnk) if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } @@ -740,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { - int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; - /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -774,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]) * sges_per_buf, + sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -872,7 +870,7 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -880,13 +878,15 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { - lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, - lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { - lnk->wr_rx_v2_dma_addr = 0; - rc = -EIO; - goto dma_unmap; + if (smc_link_shared_v2_rxbuf(lnk)) { + lnk->wr_rx_v2_dma_addr = + ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2, + SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, @@ -935,7 +935,7 @@ dma_unmap: lnk->wr_tx_v2_dma_addr = 0; } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; out: diff --git a/net/socket.c b/net/socket.c index 9a117248f18f..16402b8be5a7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1008,12 +1008,23 
@@ static void sock_recv_mark(struct msghdr *msg, struct sock *sk, } } +static void sock_recv_priority(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) { + __u32 priority = skb->priority; + + put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority); + } +} + void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); + sock_recv_priority(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index d1180370fdf4..e74940eab3a4 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -949,8 +949,8 @@ void tipc_nametbl_stop(struct net *net) } spin_unlock_bh(&tn->nametbl_lock); - synchronize_net(); - kfree(nt); + /* TODO: clear tn->nametbl, implement proper RCU rules ? */ + kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 3bcd9ef8cee3..7ff6eeebaae6 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -90,6 +90,7 @@ struct publication { /** * struct name_table - table containing all existing port name publications + * @rcu: RCU callback head used for deferred freeing * @services: name sequence hash lists * @node_scope: all local publications with node scope * - used by name_distr during re-init of name table @@ -102,6 +103,7 @@ struct publication { * @snd_nxt: next sequence number to be used */ struct name_table { + struct rcu_head rcu; struct hlist_head services[TIPC_NAMETBL_SIZE]; struct list_head node_scope; struct list_head cluster_scope; diff --git a/net/tls/tls.h b/net/tls/tls.h index e5e47452308a..774859b63f0d 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -145,7 +145,8 @@ void tls_err_abort(struct sock *sk, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); -int tls_set_sw_offload(struct sock *sk, int tx); +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index dc063c2c7950..e50b6e71df13 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1227,7 +1227,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6b4b9f2749a6..9ee5a83c5b40 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -423,9 +423,10 @@ static __poll_t tls_sk_poll(struct file *file, struct socket *sock, ctx = tls_sw_ctx_rx(tls_ctx); psock = sk_psock_get(sk); - if (skb_queue_empty_lockless(&ctx->rx_list) && - !tls_strp_msg_ready(ctx) && - sk_psock_queue_empty(psock)) + if ((skb_queue_empty_lockless(&ctx->rx_list) && + !tls_strp_msg_ready(ctx) && + sk_psock_queue_empty(psock)) || + READ_ONCE(ctx->key_update_pending)) mask &= ~(EPOLLIN | EPOLLRDNORM); if (psock) @@ -612,11 +613,13 @@ static int validate_crypto_info(const struct tls_crypto_info *crypto_info, static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned 
int optlen, int tx) { - struct tls_crypto_info *crypto_info; - struct tls_crypto_info *alt_crypto_info; + struct tls_crypto_info *crypto_info, *alt_crypto_info; + struct tls_crypto_info *old_crypto_info = NULL; struct tls_context *ctx = tls_get_ctx(sk); const struct tls_cipher_desc *cipher_desc; union tls_crypto_context *crypto_ctx; + union tls_crypto_context tmp = {}; + bool update = false; int rc = 0; int conf; @@ -633,9 +636,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, crypto_info = &crypto_ctx->info; - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - return -EBUSY; + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + /* Currently we only support setting crypto info more + * than one time for TLS 1.3 + */ + if (crypto_info->version != TLS_1_3_VERSION) { + TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); + return -EBUSY; + } + + update = true; + old_crypto_info = crypto_info; + crypto_info = &tmp.info; + crypto_ctx = &tmp; + } rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { @@ -643,7 +658,14 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, goto err_crypto_info; } - rc = validate_crypto_info(crypto_info, alt_crypto_info); + if (update) { + /* Ensure that TLS version and ciphers are not modified */ + if (crypto_info->version != old_crypto_info->version || + crypto_info->cipher_type != old_crypto_info->cipher_type) + rc = -EINVAL; + } else { + rc = validate_crypto_info(crypto_info, alt_crypto_info); + } if (rc) goto err_crypto_info; @@ -673,11 +695,17 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } else { - rc = tls_set_sw_offload(sk, 1); + rc = tls_set_sw_offload(sk, 1, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + } conf = TLS_SW; } } else { @@ -687,14 +715,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); } else { - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } conf = TLS_SW; } - tls_sw_strparser_arm(sk, ctx); + if (!update) + tls_sw_strparser_arm(sk, ctx); } if (tx) @@ -713,6 +748,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, return 0; err_crypto_info: + if (update) { + TLS_INC_STATS(sock_net(sk), tx ? 
LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); + } memzero_explicit(crypto_ctx, sizeof(*crypto_ctx)); return rc; } diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 68982728f620..367666aa07b8 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -22,6 +22,11 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), + SNMP_MIB_ITEM("TlsRxRekeyOk", LINUX_MIB_TLSRXREKEYOK), + SNMP_MIB_ITEM("TlsRxRekeyError", LINUX_MIB_TLSRXREKEYERROR), + SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), + SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), + SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee..47550d485819 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1314,6 +1314,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, int ret = 0; long timeo; + /* a rekey is pending, let userspace deal with it */ + if (unlikely(ctx->key_update_pending)) + return -EKEYEXPIRED; + timeo = sock_rcvtimeo(sk, nonblock); while (!tls_strp_msg_ready(ctx)) { @@ -1720,6 +1724,36 @@ tls_decrypt_device(struct sock *sk, struct msghdr *msg, return 1; } +static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx, + struct sk_buff *skb) +{ + const struct strp_msg *rxm = strp_msg(skb); + const struct tls_msg *tlm = tls_msg(skb); + char hs_type; + int err; + + if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE)) + return 0; + + if (rxm->full_len < 1) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &hs_type, 1); + if (err < 0) { + DEBUG_NET_WARN_ON_ONCE(1); + return err; + } + + if (hs_type == TLS_HANDSHAKE_KEYUPDATE) { + struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; + + WRITE_ONCE(rx_ctx->key_update_pending, true); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED); + } + + return 0; +} + static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, struct tls_decrypt_arg *darg) { @@ -1739,7 +1773,7 @@ static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - return 0; + return tls_check_pending_rekey(sk, tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) @@ -2684,12 +2718,22 @@ int init_prot_info(struct tls_prot_info *prot, return 0; } -int tls_set_sw_offload(struct sock *sk, int tx) +static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx) +{ + struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx; + + WRITE_ONCE(ctx->key_update_pending, false); + /* wake-up pre-existing poll() */ + ctx->saved_data_ready(sk); +} + +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info) { + struct tls_crypto_info *crypto_info, *src_crypto_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; const struct tls_cipher_desc *cipher_desc; - struct tls_crypto_info *crypto_info; char *iv, *rec_seq, *key, *salt; struct cipher_context *cctx; struct tls_prot_info *prot; @@ -2701,44 +2745,47 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx = tls_get_ctx(sk); prot = &ctx->prot_info; - if (tx) { - ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); - if (!ctx->priv_ctx_tx) - return -ENOMEM; + /* new_crypto_info != NULL means rekey */ + if 
(!new_crypto_info) { + if (tx) { + ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); + if (!ctx->priv_ctx_tx) + return -ENOMEM; + } else { + ctx->priv_ctx_rx = init_ctx_rx(ctx); + if (!ctx->priv_ctx_rx) + return -ENOMEM; + } + } + if (tx) { sw_ctx_tx = ctx->priv_ctx_tx; crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { - ctx->priv_ctx_rx = init_ctx_rx(ctx); - if (!ctx->priv_ctx_rx) - return -ENOMEM; - sw_ctx_rx = ctx->priv_ctx_rx; crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; } - cipher_desc = get_cipher_desc(crypto_info->cipher_type); + src_crypto_info = new_crypto_info ?: crypto_info; + + cipher_desc = get_cipher_desc(src_crypto_info->cipher_type); if (!cipher_desc) { rc = -EINVAL; goto free_priv; } - rc = init_prot_info(prot, crypto_info, cipher_desc); + rc = init_prot_info(prot, src_crypto_info, cipher_desc); if (rc) goto free_priv; - iv = crypto_info_iv(crypto_info, cipher_desc); - key = crypto_info_key(crypto_info, cipher_desc); - salt = crypto_info_salt(crypto_info, cipher_desc); - rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); - - memcpy(cctx->iv, salt, cipher_desc->salt); - memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); - memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + iv = crypto_info_iv(src_crypto_info, cipher_desc); + key = crypto_info_key(src_crypto_info, cipher_desc); + salt = crypto_info_salt(src_crypto_info, cipher_desc); + rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc); if (!*aead) { *aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); @@ -2751,20 +2798,30 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx->push_pending_record = tls_sw_push_pending_record; + /* setkey is the last operation that could fail during a + * rekey. if it succeeds, we can start modifying the + * context. 
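+	 * For a rekey the caller stages the new key material in a
+	 * stack-local union tls_crypto_context and passes it in as
+	 * new_crypto_info; only once setkey has succeeded is it copied
+	 * over the live crypto_info, and the staging copy is then wiped
+	 * with memzero_explicit().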
+ */ rc = crypto_aead_setkey(*aead, key, cipher_desc->key); - if (rc) - goto free_aead; + if (rc) { + if (new_crypto_info) + goto out; + else + goto free_aead; + } - rc = crypto_aead_setauthsize(*aead, prot->tag_size); - if (rc) - goto free_aead; + if (!new_crypto_info) { + rc = crypto_aead_setauthsize(*aead, prot->tag_size); + if (rc) + goto free_aead; + } - if (sw_ctx_rx) { + if (!tx && !new_crypto_info) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); tls_update_rx_zc_capable(ctx); sw_ctx_rx->async_capable = - crypto_info->version != TLS_1_3_VERSION && + src_crypto_info->version != TLS_1_3_VERSION && !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); rc = tls_strp_init(&sw_ctx_rx->strp, sk); @@ -2772,18 +2829,33 @@ int tls_set_sw_offload(struct sock *sk, int tx) goto free_aead; } + memcpy(cctx->iv, salt, cipher_desc->salt); + memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); + memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + + if (new_crypto_info) { + unsafe_memcpy(crypto_info, new_crypto_info, + cipher_desc->crypto_info, + /* size was checked in do_tls_setsockopt_conf */); + memzero_explicit(new_crypto_info, cipher_desc->crypto_info); + if (!tx) + tls_finish_key_update(sk, ctx); + } + goto out; free_aead: crypto_free_aead(*aead); *aead = NULL; free_priv: - if (tx) { - kfree(ctx->priv_ctx_tx); - ctx->priv_ctx_tx = NULL; - } else { - kfree(ctx->priv_ctx_rx); - ctx->priv_ctx_rx = NULL; + if (!new_crypto_info) { + if (tx) { + kfree(ctx->priv_ctx_tx); + ctx->priv_ctx_tx = NULL; + } else { + kfree(ctx->priv_ctx_rx); + ctx->priv_ctx_rx = NULL; + } } out: return rc; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6b1762300443..8f2b605ce5b3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -286,14 +286,9 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -static inline int unix_our_peer(struct sock *sk, struct sock *osk) -{ - return unix_peer(osk) == sk; -} - static inline int unix_may_send(struct sock *sk, struct sock *osk) { - return unix_peer(osk) == NULL || unix_our_peer(sk, osk); + return !unix_peer(osk) || unix_peer(osk) == sk; } static inline int unix_recvq_full_lockless(const struct sock *sk) @@ -1563,32 +1558,30 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. - If we will make it after state is locked, - we will have to recheck all again in any case. + * If we will make it after state is locked, + * we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); - newsk = NULL; goto out; } - err = -ENOMEM; - /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); - if (skb == NULL) - goto out; + if (!skb) { + err = -ENOMEM; + goto out_free_sk; + } restart: /* Find listening sock. 
*/ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; - goto out; + goto out_free_skb; } unix_state_lock(other); @@ -1600,23 +1593,25 @@ restart: goto restart; } - err = -ECONNREFUSED; - if (other->sk_state != TCP_LISTEN) - goto out_unlock; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_state != TCP_LISTEN || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -ECONNREFUSED; goto out_unlock; + } if (unix_recvq_full_lockless(other)) { - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; goto out_unlock; + } timeo = unix_wait_for_peer(other, timeo); + sock_put(other); err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out; - sock_put(other); + goto out_free_skb; + goto restart; } @@ -1701,15 +1696,13 @@ restart: return 0; out_unlock: - if (other) - unix_state_unlock(other); - -out: + unix_state_unlock(other); + sock_put(other); +out_free_skb: kfree_skb(skb); - if (newsk) - unix_release_sock(newsk, 0); - if (other) - sock_put(other); +out_free_sk: + unix_release_sock(newsk, 0); +out: return err; } @@ -1964,7 +1957,6 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; @@ -1980,12 +1972,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) + if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; goto out; + } if (msg->msg_namelen) { - err = unix_validate_addr(sunaddr, msg->msg_namelen); + err = unix_validate_addr(msg->msg_name, msg->msg_namelen); if (err) goto out; @@ -1995,12 +1988,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, NULL); if (err) goto out; - } else { - sunaddr = NULL; - err = -ENOTCONN; - other = unix_peer_get(sk); - if (!other) - goto out; } if ((test_bit(SOCK_PASSCRED, &sock->flags) || @@ -2011,9 +1998,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - err = -EMSGSIZE; - if (len > READ_ONCE(sk->sk_sndbuf) - 32) + if (len > READ_ONCE(sk->sk_sndbuf) - 32) { + err = -EMSGSIZE; goto out; + } if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, @@ -2027,7 +2015,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); - if (skb == NULL) + if (!skb) goto out; err = unix_scm_to_skb(&scm, skb, true); @@ -2043,17 +2031,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); -restart: - if (!other) { - err = -ECONNRESET; - if (sunaddr == NULL) - goto out_free; - - other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, - sk->sk_type); + if (msg->msg_namelen) { +lookup: + other = unix_find_other(sock_net(sk), msg->msg_name, + msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; + goto out_free; + } + } else { + other = unix_peer_get(sk); + if (!other) { + err = -ENOTCONN; goto out_free; } } @@ -2061,36 +2050,37 @@ restart: if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; - goto out_free; + goto out_sock_put; } +restart: sk_locked = 0; 
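/* At this point a reference on 'other' is held; the error paths
	 * below must exit through out_sock_put so that it is dropped
	 * before the skb is freed.
	 */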
unix_state_lock(other); restart_locked: - err = -EPERM; - if (!unix_may_send(sk, other)) + + if (!unix_may_send(sk, other)) { + err = -EPERM; goto out_unlock; + } if (unlikely(sock_flag(other, SOCK_DEAD))) { - /* - * Check with 1003.1g - what should - * datagram error - */ - unix_state_unlock(other); - sock_put(other); + /* Check with 1003.1g - what should datagram error */ - if (!sk_locked) - unix_state_lock(sk); + unix_state_unlock(other); - err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ - unix_state_unlock(sk); err = -EPIPE; - } else if (unix_peer(sk) == other) { + goto out_sock_put; + } + + if (!sk_locked) + unix_state_lock(sk); + + if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); @@ -2100,19 +2090,23 @@ restart_locked: unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; - } else { - unix_state_unlock(sk); + goto out_sock_put; } - other = NULL; - if (err) - goto out_free; - goto restart; + unix_state_unlock(sk); + + if (!msg->msg_namelen) { + err = -ECONNRESET; + goto out_sock_put; + } + + goto lookup; } - err = -EPIPE; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; goto out_unlock; + } if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); @@ -2132,7 +2126,7 @@ restart_locked: err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out_free; + goto out_sock_put; goto restart; } @@ -2173,11 +2167,11 @@ out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); +out_sock_put: + sock_put(other); out_free: kfree_skb(skb); out: - if (other) - sock_put(other); scm_destroy(&scm); return err; } @@ -2256,8 +2250,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; @@ -2270,14 +2264,20 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out_err; } else { - err = -ENOTCONN; other = unix_peer(sk); - if (!other) + if (!other) { + err = -ENOTCONN; goto out_err; + } } - if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) - goto pipe_err; + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { + if (!(msg->msg_flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + + err = -EPIPE; + goto out_err; + } while (sent < len) { size = len - sent; @@ -2306,20 +2306,18 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { @@ -2327,17 +2325,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); - if (err) { - kfree_skb(skb); - goto out_err; - } + if (err) + goto out_free; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto pipe_err_free; + goto out_pipe; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); @@ -2360,13 +2356,13 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return sent; -pipe_err_free: +out_pipe: unix_state_unlock(other); - kfree_skb(skb); -pipe_err: - if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) + if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; +out_free: + kfree_skb(skb); out_err: scm_destroy(&scm); return sent ? : err; diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index 2fbfb6a94c11..eeb49597f4f4 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -837,7 +837,7 @@ impl DeviceMask { /// [::kernel::net::phy::create_phy_driver::<PhySample>()]; /// /// impl ::kernel::Module for Module { -/// fn init(module: &'static ThisModule) -> Result<Self> { +/// fn init(module: &'static ::kernel::ThisModule) -> Result<Self> { /// let drivers = unsafe { &mut DRIVERS }; /// let mut reg = ::kernel::net::phy::Registration::register( /// module, @@ -903,7 +903,7 @@ macro_rules! module_phy_driver { [$($crate::net::phy::create_phy_driver::<$driver>()),+]; impl $crate::Module for Module { - fn init(module: &'static ThisModule) -> Result<Self> { + fn init(module: &'static $crate::ThisModule) -> Result<Self> { // SAFETY: The anonymous constant guarantees that nobody else can access // the `DRIVERS` static. The array is used only in the C side. 
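// Using the fully-qualified `$crate::ThisModule` in the `init`
// signature above keeps the expansion independent of the caller's
// imports: macro-generated code must not assume `ThisModule` is in
// scope at the expansion site.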
let drivers = unsafe { &mut DRIVERS }; diff --git a/scripts/.gitignore b/scripts/.gitignore index 3dbb8bb2457b..c2ef68848da5 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only /asn1_compiler +/gen_packed_field_checks /generate_rust_target /insert-sys-cert /kallsyms diff --git a/scripts/Makefile b/scripts/Makefile index 6bcda4b9d054..546e8175e1c4 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -47,7 +47,7 @@ HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED endif # The following programs are only built on demand -hostprogs += unifdef +hostprogs += unifdef gen_packed_field_checks # The module linker script is preprocessed on demand targets += module.lds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9eed3683ad76..a2066a6c9dd8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5843,6 +5843,8 @@ sub process { #CamelCase if ($var !~ /^$Constant$/ && $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && +#Ignore C keywords + $var !~ /^_Generic$/ && #Ignore some autogenerated defines and enum values $var !~ /^(?:[A-Z]+_){1,5}[A-Z]{1,3}[a-z]/ && #Ignore Page<foo> variants diff --git a/scripts/gen_packed_field_checks.c b/scripts/gen_packed_field_checks.c new file mode 100644 index 000000000000..60042b7616ee --- /dev/null +++ b/scripts/gen_packed_field_checks.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024, Intel Corporation +#include <stdbool.h> +#include <stdio.h> + +#define MAX_PACKED_FIELD_SIZE 50 + +int main(int argc, char **argv) +{ + /* The first macro doesn't need a 'do {} while(0)' loop */ + printf("#define CHECK_PACKED_FIELDS_1(fields) \\\n"); + printf("\tCHECK_PACKED_FIELD(fields, 0)\n\n"); + + /* Remaining macros require a do/while loop, and are implemented + * recursively by calling the previous iteration's macro. 
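+ *
+ * As an illustration (derived from the printf format strings below),
+ * the i == 2 iteration emits:
+ *
+ *	#define CHECK_PACKED_FIELDS_2(fields) do { \
+ *		CHECK_PACKED_FIELDS_1(fields); \
+ *		CHECK_PACKED_FIELD(fields, 1); \
+ *	} while (0)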
+ */ + for (int i = 2; i <= MAX_PACKED_FIELD_SIZE; i++) { + printf("#define CHECK_PACKED_FIELDS_%d(fields) do { \\\n", i); + printf("\tCHECK_PACKED_FIELDS_%d(fields); \\\n", i - 1); + printf("\tCHECK_PACKED_FIELD(fields, %d); \\\n", i - 1); + printf("} while (0)\n\n"); + } + + printf("#define CHECK_PACKED_FIELDS(fields) \\\n"); + + for (int i = 1; i <= MAX_PACKED_FIELD_SIZE; i++) + printf("\t__builtin_choose_expr(ARRAY_SIZE(fields) == %d, ({ CHECK_PACKED_FIELDS_%d(fields); }), \\\n", + i, i); + + printf("\t({ BUILD_BUG_ON_MSG(1, \"CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than %d.\"); }) \\\n", + MAX_PACKED_FIELD_SIZE); + + for (int i = 1; i <= MAX_PACKED_FIELD_SIZE; i++) + printf(")"); + + printf("\n"); +} diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index 281df9139d2b..ffff554a5230 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -126,6 +126,8 @@ #define SCM_TS_OPT_ID 78 +#define SO_RCVPRIORITY 79 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d8201c4b1520..ec2288948795 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -801,6 +801,8 @@ class EnumSet(SpecEnumSet): self.user_type = 'int' self.value_pfx = yaml.get('name-prefix', f"{family.ident_name}-{yaml['name']}-") + self.header = yaml.get('header', None) + self.enum_cnt_name = yaml.get('enum-cnt-name', None) super().__init__(family, yaml) @@ -2417,6 +2419,87 @@ def uapi_enum_start(family, cw, obj, ckey='', enum_name='enum-name'): cw.block_start(line=start_line) +def render_uapi_unified(family, cw, max_by_define, separate_ntf): + max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) + cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) + max_value = f"({cnt_name} - 1)" + + uapi_enum_start(family, cw, family['operations'], 'enum-name') + val = 0 + for op in family.msgs.values(): + if separate_ntf and ('notify' in op or 'event' in op): + continue + + suffix = ',' + if op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + +def render_uapi_directional(family, cw, max_by_define): + max_name = f"{family.op_prefix}USER_MAX" + cnt_name = f"__{family.op_prefix}USER_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_USER_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if 'do' in op and 'event' not in op: + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + max_name = f"{family.op_prefix}KERNEL_MAX" + cnt_name = f"__{family.op_prefix}KERNEL_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_KERNEL_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if ('do' in op and 'reply' in op['do']) or 'notify' in op or 'event' in op: + enum_name 
= op.enum_name + if 'event' not in op and 'notify' not in op: + enum_name = f'{enum_name}_REPLY' + + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + def render_uapi(family, cw): hdr_prot = f"_UAPI_LINUX_{c_upper(family.uapi_header_name)}_H" hdr_prot = hdr_prot.replace('/', '_') @@ -2440,6 +2523,9 @@ def render_uapi(family, cw): if const['type'] == 'enum' or const['type'] == 'flags': enum = family.consts[const['name']] + if enum.header: + continue + if enum.has_doc(): if enum.has_entry_doc(): cw.p('/**') @@ -2472,9 +2558,12 @@ def render_uapi(family, cw): max_val = f' = {enum.get_mask()},' cw.p(max_name + max_val) else: + cnt_name = enum.enum_cnt_name max_name = c_upper(name_pfx + 'max') - cw.p('__' + max_name + ',') - cw.p(max_name + ' = (__' + max_name + ' - 1)') + if not cnt_name: + cnt_name = '__' + name_pfx + 'max' + cw.p(c_upper(cnt_name) + ',') + cw.p(max_name + ' = (' + c_upper(cnt_name) + ' - 1)') cw.block_end(line=';') cw.nl() elif const['type'] == 'const': @@ -2515,30 +2604,12 @@ def render_uapi(family, cw): # Commands separate_ntf = 'async-prefix' in family['operations'] - max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) - cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) - max_value = f"({cnt_name} - 1)" - - uapi_enum_start(family, cw, family['operations'], 'enum-name') - val = 0 - for op in family.msgs.values(): - if separate_ntf and ('notify' in op or 'event' in op): - continue - - suffix = ',' - if op.value != val: - suffix = f" = {op.value}," - val = op.value - cw.p(op.enum_name + suffix) - val += 1 - cw.nl() - cw.p(cnt_name + ('' if max_by_define else ',')) - if not max_by_define: - cw.p(f"{max_name} = {max_value}") - cw.block_end(line=';') - if max_by_define: - cw.p(f"#define {max_name} {max_value}") - cw.nl() + if family.msg_id_model == 'unified': + render_uapi_unified(family, cw, max_by_define, separate_ntf) + elif family.msg_id_model == 'directional': + render_uapi_directional(family, cw, max_by_define) + else: + raise Exception(f'Unsupported message enum-model {family.msg_id_model}') if separate_ntf: uapi_enum_start(family, cw, family['operations'], enum_name='async-enum') @@ -2635,7 +2706,8 @@ def find_kernel_root(full_path): def main(): parser = argparse.ArgumentParser(description='Netlink simple parsing generator') - parser.add_argument('--mode', dest='mode', type=str, required=True) + parser.add_argument('--mode', dest='mode', type=str, required=True, + choices=('user', 'kernel', 'uapi')) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--header', dest='header', action='store_true', default=None) parser.add_argument('--source', dest='header', action='store_false') @@ -2662,13 +2734,6 @@ def main(): os.sys.exit(1) return - supported_models = ['unified'] - if args.mode in ['user', 'kernel']: - supported_models += ['directional'] - if parsed.msg_id_model not in supported_models: - print(f'Message enum-model {parsed.msg_id_model} not supported for {args.mode} generation') - os.sys.exit(1) - cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) _, spec_kernel = find_kernel_root(args.spec) @@ -2696,7 +2761,10 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() - 
hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.out_file: + hdr_file = os.path.basename(args.out_file[:-2]) + ".h" + else: + hdr_file = "generated_header_file.h" if args.mode == 'kernel': cw.p('#include <net/netlink.h>') @@ -2718,12 +2786,17 @@ def main(): else: cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') - headers = [parsed.uapi_header] + headers = [] for definition in parsed['definitions']: if 'header' in definition: headers.append(definition['header']) + if args.mode == 'user': + headers.append(parsed.uapi_header) + seen_header = [] for one in headers: - cw.p(f"#include <{one}>") + if one not in seen_header: + cw.p(f"#include <{one}>") + seen_header.append(one) cw.nl() if args.mode == "user": diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index cb2fc601de66..f09bd96cc978 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -32,6 +32,7 @@ TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh +TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh TEST_PROGS += nl_netdev.py diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c index 99b0e8c17fca..04c7ff577bb8 100644 --- a/tools/testing/selftests/net/busy_poller.c +++ b/tools/testing/selftests/net/busy_poller.c @@ -54,16 +54,16 @@ struct epoll_params { #define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) #endif -static uint32_t cfg_port = 8000; +static uint16_t cfg_port = 8000; static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY }; static char *cfg_outfile; static int cfg_max_events = 8; -static int cfg_ifindex; +static uint32_t cfg_ifindex; /* busy poll params */ static uint32_t cfg_busy_poll_usecs; -static uint32_t cfg_busy_poll_budget; -static uint32_t cfg_prefer_busy_poll; +static uint16_t cfg_busy_poll_budget; +static uint8_t cfg_prefer_busy_poll; /* IRQ params */ static uint32_t cfg_defer_hard_irqs; @@ -79,6 +79,7 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { + unsigned long long tmp; int ret; int c; @@ -86,31 +87,40 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) { + /* most options take integer values, except o and b, so reduce + * code duplication a bit for the common case by calling + * strtoull here and leave bounds checking and casting per + * option below. 
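+	 * The two exceptions take strings: 'o' is the output file path
+	 * and 'b' is a bind address parsed with inet_aton(), so neither
+	 * goes through the shared numeric pre-parse.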
+ */ + if (c != 'o' && c != 'b') + tmp = strtoull(optarg, NULL, 0); + switch (c) { case 'u': - cfg_busy_poll_usecs = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_usecs == ULONG_MAX || - cfg_busy_poll_usecs > UINT32_MAX) + if (tmp == ULLONG_MAX || tmp > UINT32_MAX) error(1, ERANGE, "busy_poll_usecs too large"); + + cfg_busy_poll_usecs = (uint32_t)tmp; break; case 'P': - cfg_prefer_busy_poll = strtoul(optarg, NULL, 0); - if (cfg_prefer_busy_poll == ULONG_MAX || - cfg_prefer_busy_poll > 1) + if (tmp == ULLONG_MAX || tmp > 1) error(1, ERANGE, "prefer busy poll should be 0 or 1"); + + cfg_prefer_busy_poll = (uint8_t)tmp; break; case 'g': - cfg_busy_poll_budget = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_budget == ULONG_MAX || - cfg_busy_poll_budget > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "busy poll budget must be [0, UINT16_MAX]"); + + cfg_busy_poll_budget = (uint16_t)tmp; break; case 'p': - cfg_port = strtoul(optarg, NULL, 0); - if (cfg_port > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "port must be <= 65535"); + + cfg_port = (uint16_t)tmp; break; case 'b': ret = inet_aton(optarg, &cfg_bind_addr); @@ -124,41 +134,39 @@ static void parse_opts(int argc, char **argv) error(1, 0, "outfile invalid"); break; case 'm': - cfg_max_events = strtol(optarg, NULL, 0); - - if (cfg_max_events == LONG_MIN || - cfg_max_events == LONG_MAX || - cfg_max_events <= 0) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "max events must be > 0 and < LONG_MAX"); + "max events must be > 0 and <= INT_MAX"); + + cfg_max_events = (int)tmp; break; case 'd': - cfg_defer_hard_irqs = strtoul(optarg, NULL, 0); - - if (cfg_defer_hard_irqs == ULONG_MAX || - cfg_defer_hard_irqs > INT32_MAX) + if (tmp == ULLONG_MAX || tmp > INT32_MAX) error(1, ERANGE, "defer_hard_irqs must be <= INT32_MAX"); + + cfg_defer_hard_irqs = (uint32_t)tmp; break; case 'r': - cfg_gro_flush_timeout = strtoull(optarg, NULL, 0); - - if (cfg_gro_flush_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, - "gro_flush_timeout must be < ULLONG_MAX"); + "gro_flush_timeout must be < UINT64_MAX"); + + cfg_gro_flush_timeout = (uint64_t)tmp; break; case 's': - cfg_irq_suspend_timeout = strtoull(optarg, NULL, 0); - - if (cfg_irq_suspend_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, "irq_suspend_timeout must be < ULLONG_MAX"); + + cfg_irq_suspend_timeout = (uint64_t)tmp; break; case 'i': - cfg_ifindex = strtoul(optarg, NULL, 0); - if (cfg_ifindex == ULONG_MAX) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "ifindex must be < ULONG_MAX"); + "ifindex must be <= INT_MAX"); + + cfg_ifindex = (int)tmp; break; } } @@ -215,7 +223,7 @@ static void setup_queue(void) struct netdev_napi_set_req *set_req = NULL; struct ynl_sock *ys; struct ynl_error yerr; - uint32_t napi_id; + uint32_t napi_id = 0; ys = ynl_sock_create(&ynl_netdev_family, &yerr); if (!ys) @@ -277,8 +285,8 @@ static void run_poller(void) * here */ epoll_params.busy_poll_usecs = cfg_busy_poll_usecs; - epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget; - epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll; + epoll_params.busy_poll_budget = cfg_busy_poll_budget; + epoll_params.prefer_busy_poll = cfg_prefer_busy_poll; epoll_params.__pad = 0; val = 1; @@ -342,5 +350,9 @@ int main(int argc, char *argv[]) parse_opts(argc, argv); setup_queue(); run_poller(); + + if (cfg_outfile) + free(cfg_outfile); + return 0; } diff --git 
a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 876c2db02a63..bc314382e4e1 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -59,6 +59,7 @@ struct options { unsigned int proto; } sock; struct option_cmsg_u32 mark; + struct option_cmsg_u32 priority; struct { bool ena; unsigned int delay; @@ -97,6 +98,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" + "\t\t-P val Set SO_PRIORITY via setsockopt\n" + "\t\t-Q val Set SO_PRIORITY via cmsg\n" "\t\t-d val Set SO_TXTIME with given delay (usec)\n" "\t\t-t Enable time stamp reporting\n" "\t\t-f val Set don't fragment via cmsg\n" @@ -115,7 +118,7 @@ static void cs_parse_args(int argc, char *argv[]) { int o; - while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:")) != -1) { + while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:Q:")) != -1) { switch (o) { case 's': opt.silent_send = true; @@ -148,6 +151,10 @@ static void cs_parse_args(int argc, char *argv[]) opt.mark.ena = true; opt.mark.val = atoi(optarg); break; + case 'Q': + opt.priority.ena = true; + opt.priority.val = atoi(optarg); + break; case 'M': opt.sockopt.mark = atoi(optarg); break; @@ -253,6 +260,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_SOCKET, SO_MARK, &opt.mark); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_SOCKET, SO_PRIORITY, &opt.priority); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_IPV6, IPV6_TCLASS, &opt.v6.tclass); diff --git a/tools/testing/selftests/net/cmsg_so_priority.sh b/tools/testing/selftests/net/cmsg_so_priority.sh new file mode 100755 index 000000000000..ee07d8653262 --- /dev/null +++ b/tools/testing/selftests/net/cmsg_so_priority.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +readonly KSFT_SKIP=4 + +IP4=192.0.2.1/24 +TGT4=192.0.2.2 +TGT4_RAW=192.0.2.3 +IP6=2001:db8::1/64 +TGT6=2001:db8::2 +TGT6_RAW=2001:db8::3 +PORT=1234 +TOTAL_TESTS=0 +FAILED_TESTS=0 + +if ! command -v jq &> /dev/null; then + echo "SKIP cmsg_so_priority.sh test: jq is not installed." 
>&2 + exit "$KSFT_SKIP" +fi + +check_result() { + ((TOTAL_TESTS++)) + if [ "$1" -ne 0 ]; then + ((FAILED_TESTS++)) + fi +} + +cleanup() +{ + cleanup_ns $NS +} + +trap cleanup EXIT + +setup_ns NS + +create_filter() { + local handle=$1 + local vlan_prio=$2 + local ip_type=$3 + local proto=$4 + local dst_ip=$5 + local ip_proto + + if [[ "$proto" == "u" ]]; then + ip_proto="udp" + elif [[ "$ip_type" == "ipv4" && "$proto" == "i" ]]; then + ip_proto="icmp" + elif [[ "$ip_type" == "ipv6" && "$proto" == "i" ]]; then + ip_proto="icmpv6" + fi + + tc -n $NS filter add dev dummy1 \ + egress pref 1 handle "$handle" proto 802.1q \ + flower vlan_prio "$vlan_prio" vlan_ethtype "$ip_type" \ + dst_ip "$dst_ip" ${ip_proto:+ip_proto $ip_proto} \ + action pass +} + +ip -n $NS link set dev lo up +ip -n $NS link add name dummy1 up type dummy + +ip -n $NS link add link dummy1 name dummy1.10 up type vlan id 10 \ + egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 + +ip -n $NS address add $IP4 dev dummy1.10 +ip -n $NS address add $IP6 dev dummy1.10 nodad + +ip netns exec $NS sysctl -wq net.ipv4.ping_group_range='0 2147483647' + +ip -n $NS neigh add $TGT4 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT4_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 + +tc -n $NS qdisc add dev dummy1 clsact + +FILTER_COUNTER=10 + +for i in 4 6; do + for proto in u i r; do + echo "Test IPV$i, prot: $proto" + for priority in {0..7}; do + if [[ $i == 4 && $proto == "r" ]]; then + TGT=$TGT4_RAW + elif [[ $i == 6 && $proto == "r" ]]; then + TGT=$TGT6_RAW + elif [ $i == 4 ]; then + TGT=$TGT4 + else + TGT=$TGT6 + fi + + handle="${FILTER_COUNTER}${priority}" + + create_filter $handle $priority ipv$i $proto $TGT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + + if [[ $pkts == 0 ]]; then + check_result 0 + else + echo "prio $priority: expected 0, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -Q $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 1 ]]; then + check_result 0 + else + echo "prio $priority -Q: expected 1, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -P $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 2 ]]; then + check_result 0 + else + echo "prio $priority -P: expected 2, got $pkts" + check_result 1 + fi + done + FILTER_COUNTER=$((FILTER_COUNTER + 10)) + done +done + +if [ $FAILED_TESTS -ne 0 ]; then + echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed" + exit 1 +else + echo "OK - All $TOTAL_TESTS tests passed" + exit 0 +fi diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh index c03151e7791c..c159230c9b62 100755 --- a/tools/testing/selftests/net/fdb_notify.sh +++ b/tools/testing/selftests/net/fdb_notify.sh @@ -49,7 +49,7 @@ test_dup_vxlan_self() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add 
"vxlan" dev vx self dst 192.0.2.1 do_test_dup del "vxlan" dev vx self dst 192.0.2.1 @@ -59,7 +59,7 @@ test_dup_vxlan_master() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan master" dev vx master do_test_dup del "vxlan master" dev vx master @@ -79,7 +79,7 @@ test_dup_macvlan_master() ip_link_add br up type bridge vlan_filtering 1 ip_link_add dd up type dummy ip_link_add mv up link dd type macvlan mode passthru - ip_link_master mv br + ip_link_set_master mv br do_test_dup add "macvlan master" dev mv self do_test_dup del "macvlan master" dev mv self diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 1d58b3b87465..847936363a12 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -291,6 +291,37 @@ fib_rule6_test() "$getnomatch" "iif dscp redirect to table" \ "iif dscp no redirect to table" fi + + fib_check_iproute_support "flowlabel" "flowlabel" + if [ $? -eq 0 ]; then + match="flowlabel 0xfffff" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel redirect to table" \ + "flowlabel no redirect to table" + + match="flowlabel 0xfffff" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel redirect to table" \ + "iif flowlabel no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel masked redirect to table" \ + "flowlabel masked no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel masked redirect to table" \ + "iif flowlabel masked no redirect to table" + fi } fib_rule6_vrf_test() diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 7d885cff8d79..00bde7b6f39e 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ + vxlan_reserved.sh \ vxlan_symmetric_ipv6.sh \ vxlan_symmetric.sh diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh index 1c8a26046589..2b5700b61ffa 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding pvid_change" NUM_NETIFS=4 source lib.sh @@ -77,12 +77,16 @@ cleanup() ping_ipv4() { - ping_test $h1 192.0.2.2 + local msg=$1 + + ping_test $h1 192.0.2.2 "$msg" } ping_ipv6() { - ping6_test $h1 2001:db8:1::2 + local msg=$1 + + ping6_test $h1 2001:db8:1::2 "$msg" } learning() @@ -95,6 +99,21 @@ flooding() flood_test $swp2 $h1 $h2 } +pvid_change() +{ + # Test that 
the changing of the VLAN-aware PVID does not affect + # VLAN-unaware forwarding + bridge vlan add vid 3 dev $swp1 pvid untagged + + ping_ipv4 " with bridge port $swp1 PVID changed" + ping_ipv6 " with bridge port $swp1 PVID changed" + + bridge vlan del vid 3 dev $swp1 + + ping_ipv4 " with bridge port $swp1 PVID deleted" + ping_ipv6 " with bridge port $swp1 PVID deleted" +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 7337f398f9cc..1fd40bada694 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -932,13 +932,6 @@ packets_rate() echo $(((t1 - t0) / interval)) } -mac_get() -{ - local if_name=$1 - - ip -j link show dev $if_name | jq -r '.[]["address"]' -} - ether_addr_to_u64() { local addr="$1" diff --git a/tools/testing/selftests/net/forwarding/vxlan_reserved.sh b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh new file mode 100755 index 000000000000..46c31794b91b --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh @@ -0,0 +1,352 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|---------------+ +# | +# +----|--------------------------------+ +# | SW | | +# | +--|------------------------------+ | +# | | + $swp1 BR1 (802.1d) | | +# | | | | +# | | + vx1 (vxlan) | | +# | | local 192.0.2.17 | | +# | | id 1000 dstport $VXPORT | | +# | +---------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +--|----------------------------------+ +# | +# +--|----------------------------------+ +# | | | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | +# | VRP2 (vrf) | +# +-------------------------------------+ + +: ${VXPORT:=4789} +: ${ALL_TESTS:=" + default_test + plain_test + reserved_0_test + reserved_10_test + reserved_31_test + reserved_56_test + reserved_63_test + "} + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + defer simple_if_fini $h1 192.0.2.1/28 + + tc qdisc add dev $h1 clsact + defer tc qdisc del dev $h1 clsact + + tc filter add dev $h1 ingress pref 77 \ + prot ip flower skip_hw ip_proto icmp action drop + defer tc filter del dev $h1 ingress pref 77 +} + +switch_create() +{ + ip_link_add br1 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip_link_set_addr br1 $(mac_get $swp1) + ip_link_set_up br1 + + ip_link_set_up $rp1 + ip_addr_add $rp1 192.0.2.17/28 + ip_route_add 192.0.2.32/28 nexthop via 192.0.2.18 + + ip_link_set_master $swp1 br1 + ip_link_set_up $swp1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + defer simple_if_fini $rp2 192.0.2.18/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + h1_create + switch_create + + vrp2_create +} + +vxlan_header_bytes() +{ + local vni=$1; shift + local -a extra_bits=("$@") + local -a bits + local i + + for ((i=0; i < 64; i++)); do + bits[i]=0 + done + + # Bit 4 is the I flag and is always on. 
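+	# The rest of the header, per RFC 7348: 24 reserved bits, the
+	# 24-bit VNI (bits 32..55), then 8 more reserved bits. As a worked
+	# example, VNI 1000 with no extra bits set encodes to
+	# 08:00:00:00:00:03:e8:00.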
+ bits[4]=1 + + for i in ${extra_bits[@]}; do + bits[i]=1 + done + + # Bits 32..55 carry the VNI + local mask=0x800000 + for ((i=0; i < 24; i++)); do + bits[$((i + 32))]=$(((vni & mask) != 0)) + ((mask >>= 1)) + done + + local bytes + for ((i=0; i < 8; i++)); do + local byte=0 + local j + for ((j=0; j < 8; j++)); do + local bit=${bits[8 * i + j]} + ((byte += bit << (7 - j))) + done + bytes+=$(printf %02x $byte): + done + + echo ${bytes%:} +} + +neg_bytes() +{ + local bytes=$1; shift + + local -A neg=([0]=f [1]=e [2]=d [3]=c [4]=b [5]=a [6]=9 [7]=8 + [8]=7 [9]=6 [a]=5 [b]=4 [c]=3 [d]=2 [e]=1 [f]=0 [:]=:) + local out + local i + + for ((i=0; i < ${#bytes}; i++)); do + local c=${bytes:$i:1} + out+=${neg[$c]} + done + echo $out +} + +vxlan_ping_do() +{ + local count=$1; shift + local dev=$1; shift + local next_hop_mac=$1; shift + local dest_ip=$1; shift + local dest_mac=$1; shift + local vni=$1; shift + local reserved_bits=$1; shift + + local vxlan_header=$(vxlan_header_bytes $vni $reserved_bits) + + $MZ $dev -c $count -d 100msec -q \ + -b $next_hop_mac -B $dest_ip \ + -t udp sp=23456,dp=$VXPORT,p=$(: + )"$vxlan_header:"$( : VXLAN + )"$dest_mac:"$( : ETH daddr + )"00:11:22:33:44:55:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"00:00:"$( : IP header csum + )"$(ipv4_to_bytes 192.0.2.3):"$( : IP saddr + )"$(ipv4_to_bytes 192.0.2.1):"$( : IP daddr + )"08:"$( : ICMP type + )"00:"$( : ICMP code + )"8b:f2:"$( : ICMP csum + )"1f:6a:"$( : ICMP request identifier + )"00:01:"$( : ICMP request seq. number + )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload + )"6d:74:0b:00:00:00:00:00:"$( : + )"10:11:12:13:14:15:16:17:"$( : + )"18:19:1a:1b:1c:1d:1e:1f:"$( : + )"20:21:22:23:24:25:26:27:"$( : + )"28:29:2a:2b:2c:2d:2e:2f:"$( : + )"30:31:32:33:34:35:36:37" +} + +vxlan_device_add() +{ + ip_link_add vx1 up type vxlan id 1000 \ + local 192.0.2.17 dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 100 "$@" + ip_link_set_master vx1 br1 +} + +vxlan_all_reserved_bits() +{ + local i + + for ((i=0; i < 64; i++)); do + if ((i == 4 || i >= 32 && i < 56)); then + continue + fi + echo $i + done +} + +vxlan_ping_vanilla() +{ + vxlan_ping_do 10 $rp2 $(mac_get $rp1) 192.0.2.17 $(mac_get $h1) 1000 +} + +vxlan_ping_reserved() +{ + for bit in $(vxlan_all_reserved_bits); do + vxlan_ping_do 1 $rp2 $(mac_get $rp1) \ + 192.0.2.17 $(mac_get $h1) 1000 "$bit" + ((n++)) + done +} + +vxlan_ping_test() +{ + local what=$1; shift + local get_stat=$1; shift + local expect=$1; shift + + RET=0 + + local t0=$($get_stat) + + "$@" + check_err $? "Failure when running $@" + + local t1=$($get_stat) + local delta=$((t1 - t0)) + + ((expect == delta)) + check_err $? "Expected to capture $expect packets, got $delta." + + log_test "$what" +} + +__default_test_do() +{ + local n_allowed_bits=$1; shift + local what=$1; shift + + vxlan_ping_test "$what: clean packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + 10 vxlan_ping_vanilla + + local t0=$(link_stats_get vx1 rx errors) + vxlan_ping_test "$what: mangled packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + $n_allowed_bits vxlan_ping_reserved + local t1=$(link_stats_get vx1 rx errors) + + RET=0 + local expect=$((39 - n_allowed_bits)) + local delta=$((t1 - t0)) + ((expect == delta)) + check_err $? "Expected $expect error packets, got $delta." 
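+	# The 39 above is the 64 VXLAN header bits minus the I flag (bit 4)
+	# and the 24 VNI bits (32..55); each probe that sets one disallowed
+	# reserved bit should be dropped and counted as an rx error.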
+ log_test "$what: drops reported" +} + +default_test_do() +{ + vxlan_device_add + __default_test_do 0 "Default" +} + +default_test() +{ + in_defer_scope \ + default_test_do +} + +plain_test_do() +{ + vxlan_device_add reserved_bits 0xf7ffffff000000ff + __default_test_do 0 "reserved_bits 0xf7ffffff000000ff" +} + +plain_test() +{ + in_defer_scope \ + plain_test_do +} + +reserved_test() +{ + local bit=$1; shift + + local allowed_bytes=$(vxlan_header_bytes 0xffffff $bit) + local reserved_bytes=$(neg_bytes $allowed_bytes) + local reserved_bits=${reserved_bytes//:/} + + vxlan_device_add reserved_bits 0x$reserved_bits + __default_test_do 1 "reserved_bits 0x$reserved_bits" +} + +reserved_0_test() +{ + in_defer_scope \ + reserved_test 0 +} + +reserved_10_test() +{ + in_defer_scope \ + reserved_test 10 +} + +reserved_31_test() +{ + in_defer_scope \ + reserved_test 31 +} + +reserved_56_test() +{ + in_defer_scope \ + reserved_test 56 +} + +reserved_63_test() +{ + in_defer_scope \ + reserved_test 63 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8994fec1c38f..2cd5c743b2d9 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -435,6 +435,13 @@ xfail_on_veth() fi } +mac_get() +{ + local if_name=$1 + + ip -j link show dev $if_name | jq -r '.[]["address"]' +} + kill_process() { local pid=$1; shift @@ -451,7 +458,7 @@ ip_link_add() defer ip link del dev "$name" } -ip_link_master() +ip_link_set_master() { local member=$1; shift local master=$1; shift @@ -459,3 +466,35 @@ ip_link_master() ip link set dev "$member" master "$master" defer ip link set dev "$member" nomaster } + +ip_link_set_addr() +{ + local name=$1; shift + local addr=$1; shift + + local old_addr=$(mac_get "$name") + ip link set dev "$name" address "$addr" + defer ip link set dev "$name" address "$old_addr" +} + +ip_link_set_up() +{ + local name=$1; shift + + ip link set dev "$name" up + defer ip link set dev "$name" down +} + +ip_addr_add() +{ + local name=$1; shift + + ip addr add dev "$name" "$@" + defer ip addr del dev "$name" "$@" +} + +ip_route_add() +{ + ip route add "$@" + defer ip route del "$@" +} diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt new file mode 100644 index 000000000000..38535701656e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking accept. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +0...0.200 accept(3, ..., ...) = 4 + + +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + + +.1 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt new file mode 100644 index 000000000000..3692ef102381 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking connect. + +`./defaults.sh` + +// Establish a connection. 
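+// The "+.1...0.200" notation below asserts that the blocking connect()
+// starts at t=0.1 and completes by t=0.2, once the SYN-ACK has arrived.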
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + + +.1...0.200 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.1 < S. 0:0(0) ack 1 win 5792 <mss 1460,nop,wscale 2,nop,nop,sackOK> + +0 > . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt new file mode 100644 index 000000000000..914eabab367a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking read. +--tolerance_usecs=10000 + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + + +0...0.100 read(4, ..., 2000) = 2000 + +.1 < P. 1:2001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 2001 + + +.1...0.200 read(4, ..., 2000) = 2000 + +.1 < P. 2001:4001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 + + +.1 < P. 4001:6001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 6001 + +0...0.000 read(4, ..., 1000) = 1000 + +0...0.000 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt new file mode 100644 index 000000000000..cec5a0725d95 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking write. +--tolerance_usecs=10000 + +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_min_tso_segs=10 +` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 50000 <mss 1000,nop,wscale 0> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 50000 + +0 accept(3, ..., ...) = 4 + +// Kernel doubles our value -> sk->sk_sndbuf is set to 42000 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [21000], 4) = 0 + +0 getsockopt(4, SOL_SOCKET, SO_SNDBUF, [42000], [4]) = 0 + +// A write of 60000 does not block. + +0...0.300 write(4, ..., 61000) = 61000 // this write() blocks + + +.1 < . 1:1(0) ack 10001 win 50000 + + +.1 < . 1:1(0) ack 30001 win 50000 + +// This ACK should wakeup the write(). An ACK of 35001 does not. + +.1 < . 1:1(0) ack 36001 win 50000 + +// Reset to sysctls defaults. +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt new file mode 100644 index 000000000000..8514d6bdbb6d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test basic connection teardown where local process closes first: +// the local process calls close() first, so we send a FIN, and receive an ACK. +// Then we receive a FIN and ACK it. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +.01...0.011 connect(3, ..., ...) = 0 + +0 > S 0:0(0) <...> + +0 < S. 0:0(0) ack 1 win 32768 <mss 1000,nop,wscale 6,nop,nop,sackOK> + +0 > . 
1:1(0) ack 1
+
+ +0 write(3, ..., 1000) = 1000
+ +0 > P. 1:1001(1000) ack 1
+ +0 < . 1:1(0) ack 1001 win 257
+
+ +0 close(3) = 0
+ +0 > F. 1001:1001(0) ack 1
+ +0 < . 1:1(0) ack 1002 win 257
+
+ +0 < F. 1:1(0) ack 1002 win 257
+ +0 > . 1002:1002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt
new file mode 100644
index 000000000000..04103134bd99
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test to make sure no RST is sent when close()
+// is called on a socket in SYN_SENT state.
+
+`./defaults.sh`
+
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+ +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+ +0 > S 0:0(0) <...>
+
+// The application decides to close the socket in SYN_SENT state.
+// Make sure no RST is sent after close().
+ +0 close(3) = 0
+
+// Receive syn-ack to trigger the send side packet examination:
+// If a RESET were sent right after close(), it would have failed with
+// a mismatched timestamp.
+ +.1 < S. 0:0(0) ack 1 win 32000 <mss 1460,nop,wscale 7>
+ +0 > R 1:1(0)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt
new file mode 100644
index 000000000000..5f3a2914213a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+// Verify behavior for the sequence: remote side sends FIN, then we close().
+// Since the remote side (client) closes first, we test our LAST_ACK code path.
+
+`./defaults.sh`
+
+// Initialize a server socket.
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +0 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+// Client closes first.
+ +.01 < F. 1:1(0) ack 1 win 257
+ +0 > . 1:1(0) ack 2
+
+// App notices that client closed.
+ +0 read(4, ..., 1000) = 0
+
+// Then we close.
+ +.01 close(4) = 0
+ +0 > F. 1:1(0) ack 2
+
+// Client ACKs our FIN.
+ +.01 < . 2:2(0) ack 2 win 257
+
+// Verify that we send RST in response to any incoming segments
+// (because the kernel no longer has any record of this socket).
+ +.01 < . 2:2(0) ack 2 win 257
+ +0 > R 2:2(0)
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt
new file mode 100644
index 000000000000..643baf3267cf
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test ECN: verify that Linux TCP ECN sending code uses ECT0 (not ECT1).
+//
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=1 # fully enabled
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+
+// ECN handshake: send EW flags in SYN packet, E flag in SYN-ACK response
++.002 ... 0.004 connect(4, ..., ...) = 0
+
+ +0 > SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.002 < SE. 0:0(0) ack 1 win 32767 <mss 1000,nop,wscale 6,nop,nop,sackOK>
+ +0 > . 1:1(0) ack 1
+
+// Write 1 MSS.
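+// The [ect0] annotation below asserts the IP ECN field of the outgoing
+// segment: ECT(0) is codepoint 0b10 per RFC 3168, which is what Linux
+// should use for data segments once ECN has been negotiated.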
++.002 write(4, ..., 1000) = 1000
+// Send 1 MSS with ect0.
+ +0 > [ect0] P. 1:1001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt
new file mode 100644
index 000000000000..f95b9b3c9fa1
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from subsequent writes to the tail
+// skb created for the chunk. The large chunk itself should be packetized as
+// usual.
+`./defaults.sh
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+ +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+ +0 write(4, ..., 10400) = 10400
+
+// Write another 10400B chunk with no coalescing options.
+ +0 send(4, ..., 10400, MSG_EOR) = 10400
+
+// Write a 2KB chunk. This chunk should not be appended to the packets created
+// for the previous chunk.
+ +0 write(4, ..., 2000) = 2000
+
+ +0 > P. 1:10001(10000) ack 1
++.001 < . 1:1(0) ack 10001 win 514
+// Now there is room to send the remaining 400B tail and the 10400B EOR chunk.
+ +0 > P. 10001:20801(10800) ack 1
++.001 < . 1:1(0) ack 20801 win 514
+// This 2KB packet should be sent alone.
+ +0 > P. 20801:22801(2000) ack 1
++.001 < . 1:1(0) ack 22801 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt
new file mode 100644
index 000000000000..2ff66075288e
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from subsequent writes to the tail
+// skb created for the chunk. Also, when packets are retransmitted, they
+// will not be coalesced into the same skb.
+`./defaults.sh
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+ +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+ +0 write(4, ..., 10400) = 10400
+
+// Write 10 400B chunks with no coalescing options.
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+// This chunk should not be appended to the skbs created for the previous chunk.
+ +0 write(4, ..., 10000) = 10000
+
+ +0 > P. 1:10001(10000) ack 1
++.001 < . 1:1(0) ack 10001 win 514
+// Now we have enough room to send the 2 x 400B packets out.
+ +0 > P. 10001:10801(800) ack 1
+// The 9 remaining 400B chunks should be sent as individual packets.
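+// (MSG_EOR marks the tail skb as end-of-record, so later writes are never
+// merged into it, on either first transmission or retransmission.)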
+ +0 > P. 10801:11201(400) ack 1
+ +0 > P. 11201:11601(400) ack 1
+ +0 > P. 11601:12001(400) ack 1
+ +0 > P. 12001:12401(400) ack 1
+ +0 > P. 12401:12801(400) ack 1
+ +0 > P. 12801:13201(400) ack 1
+ +0 > P. 13201:13601(400) ack 1
+ +0 > P. 13601:14001(400) ack 1
+ +0 > P. 14001:14401(400) ack 1
+// The last 10KB chunk should be sent separately.
+ +0 > P. 14401:24401(10000) ack 1
+
++.001 < . 1:1(0) ack 10401 win 514
++.001 < . 1:1(0) ack 10801 win 514
++.001 < . 1:1(0) ack 11201 win 514
++.001 < . 1:1(0) ack 11601 win 514
++.001 < . 1:1(0) ack 12001 win 514 <sack 13201:14401,nop,nop>
+// TCP should fill the hole but no coalescing should happen, and all
+// retransmissions should be sent out as individual packets.
+
+// Note: this is a timeout-based retransmit.
+// Do not put +0 here or flakes will come back.
++.004~+.008 > P. 12001:12401(400) ack 1
+
++.001 < . 1:1(0) ack 12401 win 514 <sack 13201:14401,nop,nop>
+ +0 > P. 12401:12801(400) ack 1
+ +0 > P. 12801:13201(400) ack 1
++.001 < . 1:1(0) ack 12801 win 514 <sack 13201:14401,nop,nop>
++.001 < . 1:1(0) ack 14401 win 514
++.001 < . 1:1(0) ack 24401 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt
new file mode 100644
index 000000000000..77039c5aac39
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from subsequent writes to the tail
+// skb created for the chunk.
+`./defaults.sh
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+ +0 accept(3, ..., ...) = 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+ +0 write(4, ..., 10400) = 10400
+
+// Write a 400B chunk with no coalescing options.
+ +0 send(4, ..., 400, MSG_EOR) = 400
+
+// This chunk should not be appended to the skbs created for the previous chunk.
+ +0 write(4, ..., 10000) = 10000
+
+ +0 > P. 1:10001(10000) ack 1
++.001 < . 1:1(0) ack 10001 win 514
+// Now we have enough room to send the 2 x 400B packets out.
+ +0 > P. 10001:10801(800) ack 1
+ +0 > P. 10801:20801(10000) ack 1
++.001 < . 1:1(0) ack 10401 win 514
++.001 < . 1:1(0) ack 10801 win 514
++.001 < . 1:1(0) ack 20801 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt
new file mode 100644
index 000000000000..dd5a06250595
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP does not append any data from subsequent writes to the tail
+// skb created for the chunk even though we have 10 back-to-back small
+// writes.
+`./defaults.sh
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 514
+
+ +0 accept(3, ..., ...)
= 4
+
+// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on
+// the tail.
+ +0 write(4, ..., 10400) = 10400
+
+// Write 10 400B chunks with no coalescing options.
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+ +0 send(4, ..., 400, MSG_EOR) = 400
+// This chunk should not be appended to the skbs created for the previous chunk.
+ +0 write(4, ..., 10000) = 10000
+
+ +0 > P. 1:10001(10000) ack 1
++.001 < . 1:1(0) ack 10001 win 514
+// Now we have enough room to send the 2 x 400B packets out.
+ +0 > P. 10001:10801(800) ack 1
+// The 9 remaining 400B chunks should be sent as individual packets.
+ +0 > P. 10801:11201(400) ack 1
+ +0 > P. 11201:11601(400) ack 1
+ +0 > P. 11601:12001(400) ack 1
+ +0 > P. 12001:12401(400) ack 1
+ +0 > P. 12401:12801(400) ack 1
+ +0 > P. 12801:13201(400) ack 1
+ +0 > P. 13201:13601(400) ack 1
+ +0 > P. 13601:14001(400) ack 1
+ +0 > P. 14001:14401(400) ack 1
+// The last 10KB chunk should be sent separately.
+ +0 > P. 14401:24401(10000) ack 1
+
++.001 < . 1:1(0) ack 10401 win 514
++.001 < . 1:1(0) ack 10801 win 514
++.001 < . 1:1(0) ack 11201 win 514
++.001 < . 1:1(0) ack 11601 win 514
++.001 < . 1:1(0) ack 12001 win 514
++.001 < . 1:1(0) ack 12401 win 514
++.001 < . 1:1(0) ack 12801 win 514
++.001 < . 1:1(0) ack 13201 win 514
++.001 < . 1:1(0) ack 13601 win 514
++.001 < . 1:1(0) ack 14001 win 514
++.001 < . 1:1(0) ack 14401 win 514
++.001 < . 1:1(0) ack 24401 win 514
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt
new file mode 100644
index 000000000000..0d3c8077e830
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation.
+// In this variant we test a simple case where in-flight == ssthresh
+// all the way through recovery, so during fast recovery we send one segment
+// for each segment SACKed/ACKed.

+// Set up config.
+`./defaults.sh`
+
+// Establish a connection.
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// RTT 100ms
+ +.1 < . 1:1(0) ack 1 win 320
+ +0 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+ +0 write(4, ..., 10000) = 10000
+ +0 > P. 1:10001(10000) ack 1
+
+// Lost packet 1:1001.
+ +.11 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:4001,nop,nop>
+// Enter fast recovery.
+ +0 > . 1:1001(1000) ack 1
+ +.01 %{
+assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state
+assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd
+assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh
+}%
+
+// Write some more, which we will send 1 MSS at a time,
+// as in-flight segments are SACKed or ACKed.
+ +.01 write(4, ..., 7000) = 7000
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:5001,nop,nop>
+ +0 > . 10001:11001(1000) ack 1
+
+ +.01 < . 
1:1(0) ack 1 win 320 <sack 1001:6001,nop,nop>
+ +0 > . 11001:12001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:7001,nop,nop>
+ +0 > . 12001:13001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:8001,nop,nop>
+ +0 > . 13001:14001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:9001,nop,nop>
+ +0 > . 14001:15001(1000) ack 1
+
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:10001,nop,nop>
+ +0 > . 15001:16001(1000) ack 1
+
+ +.02 < . 1:1(0) ack 10001 win 320
+ +0 > P. 16001:17001(1000) ack 1
+// Leave fast recovery.
+ +.01 %{
+assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state
+assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd
+assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh
+}%
+
+ +.03 < . 1:1(0) ack 12001 win 320
+ +.02 < . 1:1(0) ack 14001 win 320
+ +.02 < . 1:1(0) ack 16001 win 320
+ +.02 < . 1:1(0) ack 17001 win 320
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt
new file mode 100644
index 000000000000..7842a10b6967
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation. The sender sends 20 packets. Packets
+// 1 to 4 and 11 to 16 are dropped.
+`./defaults.sh`
+
+// Establish a connection.
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+ +.01 < . 1:1(0) ack 1 win 320
+ +0 accept(3, ..., ...) = 4
+
+// Write 20 data segments.
+ +0 write(4, ..., 20000) = 20000
+ +0 > P. 1:10001(10000) ack 1
+
+// Receive the first DUPACK, entering PRR
+ +.01 < . 1:1(0) ack 1 win 320 <sack 4001:5001,nop,nop>
+ +0 > . 10001:11001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:6001,nop,nop>
+ +0 > . 11001:12001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:7001,nop,nop>
+ +0 > . 1:1001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:8001,nop,nop>
+ +0 > . 1001:2001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:9001,nop,nop>
+ +0 > . 2001:3001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:10001,nop,nop>
+ +0 > . 3001:4001(1000) ack 1
+// Enter PRR CRB
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:11001,nop,nop>
+ +0 > . 12001:13001(1000) ack 1
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:12001,nop,nop>
+ +0 > . 13001:14001(1000) ack 1
+// Enter PRR slow start
+ +.01 < . 1:1(0) ack 1001 win 320 <sack 4001:12001,nop,nop>
+ +0 > P. 14001:16001(2000) ack 1
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:12001,nop,nop>
+ +0 > . 1001:2001(1000) ack 1
+ +0 > . 16001:17001(1000) ack 1
+// inflight reaches ssthresh, goes into packet conservation mode
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:13001,nop,nop>
+ +0 > . 17001:18001(1000) ack 1
++.002 < . 1:1(0) ack 1001 win 320 <sack 2001:14001,nop,nop>
+ +0 > . 
18001:19001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt
new file mode 100644
index 000000000000..b66d7644c3b6
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation. The sender sends 20 packets. Packets
+// 1 to 4 are lost. The sender writes another 10 packets.
+`./defaults.sh`
+
+// Establish a connection.
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+ +.01 < . 1:1(0) ack 1 win 320
+ +0 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+ +0 write(4, ..., 10000) = 10000
+ +0 > P. 1:10001(10000) ack 1
+
+// Lost packets 1-4.
+ +.01 < . 1:1(0) ack 1 win 320 <sack 4001:5001,nop,nop>
++.002 < . 1:1(0) ack 1 win 320 <sack 4001:6001,nop,nop>
+ +0 < . 1:1(0) ack 1 win 320 <sack 4001:7001,nop,nop>
+ +0 > . 1:1001(1000) ack 1
+ +0 < . 1:1(0) ack 1 win 320 <sack 4001:8001,nop,nop>
+ +0 > . 1001:2001(1000) ack 1
+ +0 < . 1:1(0) ack 1 win 320 <sack 4001:9001,nop,nop>
+ +0 > . 2001:3001(1000) ack 1
+ +0 < . 1:1(0) ack 1 win 320 <sack 4001:10001,nop,nop>
+ +0 > . 3001:4001(1000) ack 1
+
+// Receiver ACKs all data.
+ +.01 < . 1:1(0) ack 1001 win 320 <sack 4001:10001,nop,nop>
+ +0 < . 1:1(0) ack 2001 win 320 <sack 4001:10001,nop,nop>
+ +0 < . 1:1(0) ack 3001 win 320 <sack 4001:10001,nop,nop>
+ +0 < . 1:1(0) ack 10001 win 320
+
+// Write another 10 packets; the ssthresh*mss amount
+// should be sent right away.
+ +.01 write(4, ..., 10000) = 10000
+ +0 > . 10001:17001(7000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt
new file mode 100644
index 000000000000..8e87bfecabb5
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test PRR-slowstart implementation.
+// In this variant we verify that the sender uses SACK info on an ACK
+// below snd_una.
+
+// Set up config.
+`./defaults.sh`
+
+// Establish a connection.
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 8>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// RTT 10ms
+ +.01 < . 1:1(0) ack 1 win 320
+ +0 accept(3, ..., ...) = 4
+
+// Send 10 data segments.
+ +0 write(4, ..., 10000) = 10000
+ +0 > P. 1:10001(10000) ack 1
+
+// Lost packets 1:1001, 4001:5001, 7001:8001.
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+ +0 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+ +0 < . 1:1(0) ack 1 win 320 <sack 1001:3001 8001:9001,nop,nop>
+ +0 > . 1:1001(1000) ack 1
+
++.012 < . 1:1(0) ack 4001 win 320 <sack 8001:9001,nop,nop>
+ +0 > . 4001:7001(3000) ack 1
+
+ +0 write(4, ..., 10000) = 10000
+
+// The following ACK was reordered - delayed so that it arrives with
+// an ACK field below snd_una.
Here we check that the newly-SACKed +// 2MSS at 5001:7001 cause us to send out 2 more MSS. ++.002 < . 1:1(0) ack 3001 win 320 <sack 5001:7001,nop,nop> + +0 > . 7001:8001(1000) ack 1 + +0 > . 10001:11001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt new file mode 100644 index 000000000000..96b01eb5b7a4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that doesn't support SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + +// Limited transmit: on second dupack, send a new data segment. + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + +// It turned out to be reordering, not loss. +// We have one packet newly acked (1001:3001 were DUP-ACK'd) +// So we revert state back to Open. Slow start cwnd from 10 to 11 +// and send 11 - 9 = 2 packets + +.01 < . 1:1(0) ack 3001 win 320 + +0 > P. 12001:14001(2000) ack 1 + + +.02 < . 1:1(0) ack 5001 win 320 + +0 > P. 14001:15001(1000) ack 1 + +// Client gradually ACKs all data. + +.02 < . 1:1(0) ack 7001 win 320 + +.02 < . 1:1(0) ack 9001 win 320 + +.02 < . 1:1(0) ack 11001 win 320 + +.02 < . 1:1(0) ack 13001 win 320 + +.02 < . 1:1(0) ack 15001 win 320 + +// Clean up. + +.17 close(4) = 0 + +0 > F. 15001:15001(0) ack 1 + +.1 < F. 1:1(0) ack 15002 win 257 + +0 > . 15002:15002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt new file mode 100644 index 000000000000..642da51ec3a4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that supports SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 
1:1(0) ack 1 win 320 <sack 1001:2001,nop,nop>
+ +0 > . 10001:11001(1000) ack 1
+
+// Limited transmit: on second dupack, send a new data segment.
+ +.01 < . 1:1(0) ack 1 win 320 <sack 1001:3001,nop,nop>
+ +0 > . 11001:12001(1000) ack 1
+
+// It turned out to be reordering, not loss.
+ +.01 < . 1:1(0) ack 3001 win 320
+ +0 > P. 12001:14001(2000) ack 1
+
+ +.02 < . 1:1(0) ack 5001 win 320
+ +0 > P. 14001:15001(1000) ack 1
+
+// Client gradually ACKs all data.
+ +.02 < . 1:1(0) ack 7001 win 320
+ +.02 < . 1:1(0) ack 9001 win 320
+ +.02 < . 1:1(0) ack 11001 win 320
+ +.02 < . 1:1(0) ack 13001 win 320
+ +.02 < . 1:1(0) ack 15001 win 320
+
+// Clean up.
+ +.17 close(4) = 0
+ +0 > F. 15001:15001(0) ack 1
+ +.1 < F. 1:1(0) ack 15002 win 257
+ +0 > . 15002:15002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt
new file mode 100644
index 000000000000..7adae7a9ef4a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+// This is a test inspired by an Android client app using SSL. This
+// test verifies that using TCP_NODELAY saves application latency
+// (perhaps even better with TCP_NAGLE).
+//
+`./defaults.sh
+ethtool -K tun0 tso off gso off
+./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0`
+
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+ +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+ +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+
+ +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+ +0 > S 0:0(0) <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +.1 < S. 0:0(0) ack 1 win 5792 <mss 974,nop,nop,sackOK,nop,wscale 7>
+ +0 > . 1:1(0) ack 1
+
+// SSL handshake (resumed session)
+ +0 write(4, ..., 517) = 517
+ +0 > P. 1:518(517) ack 1
+ +.1 < . 1:1(0) ack 518 win 229
+
+ +0 < P. 1:144(143) ack 1 win 229
+ +0 > . 518:518(0) ack 144
+ +0 read(4, ..., 1000) = 143
+
+// Application POST header (51B) and body (2002B)
+ +0 write(4, ..., 51) = 51
+ +0 > P. 518:569(51) ack 144
+ +.03 write(4, ..., 2002) = 2002
+ +0 > . 569:1543(974) ack 144
+ +0 > P. 1543:2517(974) ack 144
+// Without disabling Nagle, this packet will not happen until the remote ACK.
+ +0 > P. 2517:2571(54) ack 144
+
+ +.1 < . 1:1(0) ack 2571 win 229
+
+// Reset sysctls
+`/tmp/sysctl_restore_${PPID}.sh`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt
new file mode 100644
index 000000000000..fa9c01813996
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test that the MSG_MORE flag correctly corks tiny writes
+`./defaults.sh`
+
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 257
+ +0 accept(3, ..., ...) = 4
+// Disable Nagle by default on this socket.
+ +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+
+// Test the basic case: MSG_MORE overrides TCP_NODELAY and enables Nagle.
+ +0 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 40}], msg_flags=0}, MSG_MORE) = 40
+ +.21~+.215 > P. 1:41(40) ack 1
+ +.01 < . 
1:1(0) ack 41 win 257
+
+// Test that a send without MSG_MORE releases the pending data
+ +0 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 100}], msg_flags=0}, MSG_MORE) = 100
++.005 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 160}], msg_flags=0}, MSG_MORE) = 160
+ +.01 sendmsg(4, {msg_name(...)=...,
+               msg_iov(3)=[{..., 100}, {..., 200}, {..., 195}],
+               msg_flags=0}, MSG_MORE) = 495
++.008 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 5}], msg_flags=0}, 0) = 5
+ +0 > P. 41:801(760) ack 1
+ +.02 < . 1:1(0) ack 801 win 257
+
+
+// Test that a write larger than the MSS releases full-MSS packets but
+// holds back the remaining data.
+ +.1 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 3100}], msg_flags=0}, MSG_MORE) = 3100
+ +0 > . 801:3801(3000) ack 1
++.003 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 50}], msg_flags=0}, MSG_MORE) = 50
+
+ +.01 < . 1:1(0) ack 2801 win 257
+// The remaining data is released right after the ACK; note that PSH is not set.
+ +0 > . 3801:3951(150) ack 1
+
+// Test that we hold subsequent writes while data (3801:3951) is in flight
++.001 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 1}], msg_flags=0}, MSG_MORE) = 1
++.002 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 2}], msg_flags=0}, MSG_MORE) = 2
++.003 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 3}], msg_flags=0}, MSG_MORE) = 3
++.004 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 4}], msg_flags=0}, MSG_MORE) = 4
+ +.02 < . 1:1(0) ack 3951 win 257
+ +0 > . 3951:3961(10) ack 1
+ +.02 < . 1:1(0) ack 3961 win 257
+
+
+// Test that a MSG_MORE send followed by a plain write flushes the data
+ +0 sendmsg(4, {msg_name(...)=...,
+               msg_iov(1)=[{..., 20}], msg_flags=0}, MSG_MORE) = 20
+ +.05 write(4, ..., 20) = 20
+ +0 > P. 3961:4001(40) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt
new file mode 100644
index 000000000000..0ddec5f7dc1a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP_CORK and TCP_NODELAY sockopt behavior
+`./defaults.sh`
+
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8>
+ +.01 < . 1:1(0) ack 1 win 257
+ +0 accept(3, ..., ...) = 4
+// Set TCP_CORK sockopt to hold small packets
+ +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0
+
+ +0 write(4, ..., 40) = 40
+ +.05 write(4, ..., 40) = 40
+
+// Unsetting TCP_CORK should push pending bytes out
+ +.01 setsockopt(4, SOL_TCP, TCP_CORK, [0], 4) = 0
+ +0 > P. 1:81(80) ack 1
+ +.01 < . 1:1(0) ack 81 win 257
+
+// Set TCP_CORK sockopt to hold small packets
+ +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0
+
+ +0 write(4, ..., 40) = 40
+ +.05 write(4, ..., 40) = 40
+
+// Setting TCP_NODELAY should push pending bytes out
+ +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+ +0 > P. 81:161(80) ack 1
+ +.01 < . 1:1(0) ack 161 win 257
+
+// Set MSG_MORE to hold small packets
+ +0 send(4, ..., 40, MSG_MORE) = 40
+ +.05 send(4, ..., 40, MSG_MORE) = 40
+
+// Setting TCP_NODELAY should push pending bytes out
+ +.01 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0
+ +0 > . 161:241(80) ack 1
+ +.01 < . 
1:1(0) ack 241 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt new file mode 100644 index 000000000000..310ef31518da --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that setsockopt calls that force a route refresh do not +// cause problems matching SACKs with packets in the write queue. +// This variant tests IP_TOS. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_IP, IP_MTU_DISCOVER, [IP_PMTUDISC_DONT], 1) = 0 + +0...0.010 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.01 < S. 0:0(0) ack 1 win 65535 <mss 1460,nop,wscale 2,nop,nop,sackOK> + +0 > . 1:1(0) ack 1 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 1:5841(5840) ack 1 + +.01 < . 1:1(0) ack 5841 win 65535 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 5841:11681(5840) ack 1 + +.01 < . 1:1(0) ack 11681 win 65535 + + +.01 write(3, ..., 14600) = 14600 + +0 > P. 11681:26281(14600) ack 1 + +// Try the socket option that we know can force a route refresh. + +0 setsockopt(3, SOL_IP, IP_TOS, [4], 1) = 0 +// Then revert to avoid routing/mangling/etc implications of that setting. + +0 setsockopt(3, SOL_IP, IP_TOS, [0], 1) = 0 + +// Verify that we do not retransmit the SACKed segments. + +.01 < . 1:1(0) ack 13141 win 65535 <sack 16061:17521 20441:26281,nop,nop> + +0 > . 13141:16061(2920) ack 1 + +0 > P. 17521:20441(2920) ack 1 + +.01 < . 1:1(0) ack 26281 win 65535 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt new file mode 100644 index 000000000000..f185e1ac57ea --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests non-FACK SACK with SACKs coming in the order +// 2 6 8 3 9, to test what happens when we get a new SACKed range +// (for packet 3) that is on the right of an existing SACKed range +// (for packet 2). + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + + +.1 < . 1:1(0) ack 1 win 257 <sack 2001:3001,nop,nop> ++.001 < . 1:1(0) ack 1 win 257 <sack 2001:3001 6001:7001,nop,nop> ++.001 < . 1:1(0) ack 1 win 257 <sack 2001:3001 6001:7001 8001:9001,nop,nop> + +// 3 SACKed packets, so we enter Fast Recovery. + +0 > . 1:1001(1000) ack 1 + +0 %{ assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state }% + +0 %{ assert tcpi_lost == 6, tcpi_lost }% + +// SACK for 3001:4001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. ++.007 < . 1:1(0) ack 1 win 257 <sack 2001:4001 6001:7001 8001:9001,nop,nop> + +0 > . 
1001:2001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +0 %{ assert tcpi_reordering == 6, tcpi_reordering }% // 8001:9001 -> 3001:4001 is 6 + +// SACK for 9001:10001. + +.01 < . 1:1(0) ack 1 win 257 <sack 2001:4001 6001:7001 8001:10001,nop,nop> + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// ACK for 1:1001 as packets from t=0.303 arrive. ++.083 < . 1:1(0) ack 1001 win 257 <sack 2001:4001 6001:7001 8001:10001,nop,nop> + +0 %{ assert tcpi_lost == 4,tcpi_lost }% + +// ACK for 1:4001 as packets from t=0.310 arrive. ++.017 < . 1:1(0) ack 4001 win 257 <sack 6001:7001 8001:10001,nop,nop> + +0 %{ assert tcpi_lost == 3,tcpi_lost }% + +// ACK for 1:7001 as packets from t=0.320 arrive. + +.01 < . 1:1(0) ack 7001 win 257 <sack 8001:10001,nop,nop> + +// ACK for all data as packets from t=0.403 arrive. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt new file mode 100644 index 000000000000..0093b4973934 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 3, and then a SACK for 4. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay the fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 <sack 7001:8001,nop,nop> +// RACK reordering timer ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7, tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 3001:4001. ++.002 < . 1:1(0) ack 1 win 257 <sack 3001:4001 7001:8001,nop,nop> + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost # since 3001:4001 is no longer lost +assert tcpi_reordering == 5, tcpi_reordering # 7001:8001 -> 3001:4001 +}% + +// SACK for 4001:5001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. +// It uses the RFC3517 algorithm to mark 1:3001 lost +// because >=3 higher-sequence packets are SACKed. ++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:8001,nop,nop> + +0 > . 2001:3001(1000) ack 1 + +0 %{ +assert tcpi_lost == 5,tcpi_lost # SACK/RFC3517 thinks 1:3001 are lost +}% + +// SACK for 8001:9001. ++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:9001,nop,nop> + +// SACK for 9001:10001. ++.002 < . 1:1(0) ack 1 win 257 <sack 3001:5001 7001:10001,nop,nop> + +0 > . 5001:6001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 
1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt new file mode 100644 index 000000000000..980a832dc81c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 5, and then a SACK for 6. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay a fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 <sack 7001:8001,nop,nop> ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7,tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 5001:6001. + +0 < . 1:1(0) ack 1 win 257 <sack 5001:6001 7001:8001,nop,nop> + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost +assert tcpi_reordering == 3, tcpi_reordering # 7001:8001 -> 5001:6001 is 3 +}% + +// SACK for 6001:7001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. + +0 < . 1:1(0) ack 1 win 257 <sack 5001:8001,nop,nop> + +0 > . 2001:3001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// SACK for 8001:9001. + +0 < . 1:1(0) ack 1 win 257 <sack 5001:9001,nop,nop> + +0 > . 3001:4001(1000) ack 1 + +// SACK for 9001:10001. + +0 < . 1:1(0) ack 1 win 257 <sack 5001:10001,nop,nop> + +0 > . 4001:5001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt new file mode 100644 index 000000000000..6740859a1360 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +// Simplest possible test of open() and then sendfile(). +// We write some zeroes into a file (since packetdrill expects payloads +// to be all zeroes) and then open() the file, then use sendfile() +// and verify that the correct number of zeroes goes out. 
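+// Note: sendfile(out_fd, in_fd, [off], count) transfers count bytes from
+// in_fd starting at offset off into the socket, updating the offset in
+// place; a single 5-byte segment is therefore expected on the wire.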
+
+`./defaults.sh
+/bin/rm -f /tmp/testfile
+/bin/dd bs=1 count=5 if=/dev/zero of=/tmp/testfile status=none
+`
+
+// Initialize connection
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+ +0 < . 1:1(0) ack 1 win 514
+
+ +0 accept(3, ..., ...) = 4
+
+ +0 open("/tmp/testfile", O_RDONLY) = 5
+ +0 sendfile(4, 5, [0], 5) = 5
+ +0 > P. 1:6(5) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt
new file mode 100644
index 000000000000..0cbd43253236
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+`./defaults.sh`
+
+// Initialize a server socket
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 setsockopt(3, SOL_IP, IP_FREEBIND, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+// Connection should get accepted
+ +0 < S 0:0(0) win 32972 <mss 1460,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <...>
+ +0 < . 1:1(0) ack 1 win 257
+ +0 accept(3, ..., ...) = 4
+
+ +0 pipe([5, 6]) = 0
+ +0 < U. 1:101(100) ack 1 win 257 urg 100
+ +0 splice(4, NULL, 6, NULL, 99, 0) = 99
+ +0 splice(4, NULL, 6, NULL, 1, 0) = 0
diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt
new file mode 100644
index 000000000000..8940726a3ec2
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test TCP fastopen behavior with NULL as buffer pointer, but a non-zero
+// buffer length.
+`./defaults.sh
+./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0`
+
+// Cache warmup: send a Fast Open cookie request
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
++0 > S 0:0(0) <mss 1460,nop,nop,sackOK,nop,wscale 8,FO,nop,nop>
++0 < S. 123:123(0) ack 1 win 14600 <mss 1460,nop,nop,sackOK,nop,wscale 6,FO abcd1234,nop,nop>
++0 > . 1:1(0) ack 1
++0 close(3) = 0
++0 > F. 1:1(0) ack 1
++0 < F. 1:1(0) ack 2 win 92
++0 > . 2:2(0) ack 2
+
+// Test with MSG_FASTOPEN without TCP_FASTOPEN_CONNECT.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 sendto(4, NULL, 1, MSG_FASTOPEN, ..., ...) = -1
++0 close(4) = 0
+
+// Test with TCP_FASTOPEN_CONNECT without MSG_FASTOPEN.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5
++0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(5, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(5, ..., ...) = 0
++0 sendto(5, NULL, 1, 0, ..., ...) = -1
++0 close(5) = 0
+
+// Test with both TCP_FASTOPEN_CONNECT and MSG_FASTOPEN.
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 6
++0 fcntl(6, F_SETFL, O_RDWR|O_NONBLOCK) = 0
++0 setsockopt(6, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0
++0 connect(6, ..., ...) = 0
++0 sendto(6, NULL, 1, MSG_FASTOPEN, ..., ...)
= -1 ++0 close(6) = 0 + +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt new file mode 100644 index 000000000000..b2b2cdf27e20 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we correctly skip zero-length IOVs. +`./defaults.sh` + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 8> + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 40}, {..., 0}, {..., 20}], + msg_flags=0}, 0) = 60 + +0 > P. 1:61(60) ack 1 + +.01 < . 1:1(0) ack 61 win 257 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 0}, {..., 0}, {..., 0}], + msg_flags=0}, MSG_ZEROCOPY) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 10}, {..., 0}, {..., 50}], + msg_flags=0}, MSG_ZEROCOPY) = 60 + +0 > P. 61:121(60) ack 1 + +.01 < . 1:1(0) ack 121 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt new file mode 100644 index 000000000000..59f5903f285c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test kernel behavior with NULL as buffer pointer + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.2 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + + +0 write(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 send(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 sendto(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) + + +0 < . 1:1001(1000) ack 1 win 200 + +0 read(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 recv(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 recvfrom(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt new file mode 100644 index 000000000000..d7fdb43a8e89 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tcpi_last_data_recv for active session +`./defaults.sh` + +// Create a socket and set it to non-blocking. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + ++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) ++0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.030 < S. 0:0(0) ack 1 win 10000 <mss 1000,sackOK,nop,nop,nop,wscale 8> ++0 > . 
1:1(0) ack 1 + ++1 %{ assert 990 <= tcpi_last_data_recv <= 1010, tcpi_last_data_recv }% + ++0 < . 1:1001(1000) ack 1 win 300 ++0 > . 1:1(0) ack 1001 + ++0 %{ assert tcpi_last_data_recv <= 10, tcpi_last_data_recv }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt new file mode 100644 index 000000000000..a9bcd46f6cb6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test rwnd limited time in tcp_info for client side. + +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +// Server advertises 0 receive window. + +.01 < S. 0:0(0) ack 1 win 0 <mss 1000,nop,nop,sackOK> + + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +// Make sure that initial rwnd limited time is 0. + +0 %{ assert tcpi_rwnd_limited == 0, tcpi_rwnd_limited }% + +// Receive window limited time starts here. + +0 write(3, ..., 1000) = 1000 + +// Check that rwnd limited time in tcp_info is around 0.1s. + +.1 %{ assert 98000 <= tcpi_rwnd_limited <= 110000, tcpi_rwnd_limited }% + +// Server opens the receive window. + +.1 < . 1:1(0) ack 1 win 2000 + +// Check that rwnd limited time in tcp_info is around 0.2s. + +0 %{ assert 198000 <= tcpi_rwnd_limited <= 210000, tcpi_rwnd_limited }% + + +0 > P. 1:1001(1000) ack 1 + +// Server advertises a very small receive window. + +.03 < . 1:1(0) ack 1001 win 10 + +// Receive window limited time starts again. + +0 write(3, ..., 1000) = 1000 + +// Server opens the receive window again. + +.1 < . 1:1(0) ack 1001 win 2000 +// Check that rwnd limited time in tcp_info is around 0.3s +// and busy time is 0.3 + 0.03 (server opened small window temporarily). + +0 %{ assert 298000 <= tcpi_rwnd_limited <= 310000, tcpi_rwnd_limited;\ + assert 328000 <= tcpi_busy_time <= 340000, tcpi_busy_time;\ +}% + + +0 > P. 1001:2001(1000) ack 1 + +.02 < . 1:1(0) ack 2001 win 2000 + +0 %{ assert 348000 <= tcpi_busy_time <= 360000, tcpi_busy_time }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt new file mode 100644 index 000000000000..f0de2acd0f8e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test send-buffer-limited time in tcp_info for client side. +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.01 < S. 0:0(0) ack 1 win 10000 <mss 1000,sackOK,nop,nop,nop,wscale 8> + +0 > . 
1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [10000], 4) = 0 + +0 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [20000], [4]) = 0 + + +.09...0.14 write(3, ..., 150000) = 150000 + + +.01 < . 1:1(0) ack 10001 win 10000 + + +.01 < . 1:1(0) ack 30001 win 10000 + +// cwnd goes from 40 (60KB) to 80 (120KB), and we hit the tiny sndbuf limit of 10KB + +.01 < . 1:1(0) ack 70001 win 10000 + + +.02 < . 1:1(0) ack 95001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited; \ + assert 49000 <= tcpi_busy_time <= 52000, tcpi_busy_time; \ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% + +// This ack frees up enough buffer so we are no longer +// buffer limited (socket flag SOCK_NOSPACE is cleared). + +.02 < . 1:1(0) ack 150001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited;\ + assert 69000 <= tcpi_busy_time <= 73000, tcpi_busy_time;\ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt new file mode 100644 index 000000000000..2087ec0c746a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that tx timestamping sends timestamps only for +// the last byte of each sendmsg. +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.01 < S. 0:0(0) ack 1 win 20000 <mss 1000,nop,nop,sackOK> + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + + +0 write(3, ..., 11000) = 11000 + +0 > P. 1:10001(10000) ack 1 + +.01 < . 1:1(0) ack 10001 win 4000 + +0 > P. 10001:11001(1000) ack 1 + +.01 < . 1:1(0) ack 11001 win 4000 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after the SYN is acked. So, we expect the last byte of the +// write to have a timestamp key of 10999 (i.e., 11000 - 1).
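+// Each recvmsg(MSG_ERRQUEUE) below returns one timestamp: an SCM_TIMESTAMPING +// cmsg carrying the software timestamp, paired with a recverr cmsg whose +// ee_info holds the SCM_TSTAMP_* type and whose ee_data holds the OPT_ID key +// of the timestamped byte.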
+ +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the last byte should be received at t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt new file mode 100644 index 000000000000..876024a31110 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for partial writes (IPv4). +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + +.01 < S. 0:0(0) ack 1 win 2000 <mss 1000,sackOK,TS val 700 ecr 100,nop,wscale 7> + +0 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700> + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1000], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have a partial write. + +0 write(3, ..., 10000) = 2964 + +0 > . 1:989(988) ack 1 <nop,nop,TS val 110 ecr 700> + +0 > P. 989:1977(988) ack 1 <nop,nop,TS val 110 ecr 700> + +.01 < . 1:1(0) ack 1977 win 92 <nop,nop,TS val 800 ecr 200> + +0 > P. 1977:2965(988) ack 1 <nop,nop,TS val 114 ecr 800> + +.01 < . 1:1(0) ack 2965 win 92 <nop,nop,TS val 800 ecr 200> + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after the first ack at t=20ms. 
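+// Only 2964 bytes were actually queued, so with SOF_TIMESTAMPING_OPT_ID the +// key of the last timestamped byte is 2963: the counter reflects the bytes +// the kernel accepted, not the length requested by write().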
+ +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after the first ack at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received after the last ack at +// t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt new file mode 100644 index 000000000000..84d94780e6be --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for server-side (IPv4). +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// Write two 2KB chunks. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after the SYN is acked. So, we expect the last byte of the first +// and the second chunks to have timestamp keys of 1999 (i.e., 2000 - 1) and +// 3999 (i.e., 4000 - 1) respectively. + +0 write(4, ..., 2000) = 2000 + +0 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 + +0 > P. 2001:4001(2000) ack 1 + +.01 < . 1:1(0) ack 2001 win 514 + +.01 < . 1:1(0) ack 4001 win 514 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after write at t=10ms.
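+// The error-queue messages below are read back in generation order: SCHED +// then SND for each chunk (both at t=10ms, when the two writes go out), +// followed by the ACK timestamps at t=20ms and t=30ms.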
+ +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after write at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SCHED for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received at t=20ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the second chunk should be received at t=30ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt new file mode 100644 index 000000000000..e61424a7bd0a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send FIN packet with correct TSval +--tcp_ts_tick_usecs=1000 +--tolerance_usecs=7000 + +`./defaults.sh` + +// Create a socket. 
+ 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0> + +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100> + +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100> + +0 accept(3, ..., ...) = 4 + + +1 close(4) = 0 +// Check that the FIN TSval is updated properly; one second has passed since the last sent packet. + +0 > F. 1:1(0) ack 1 <nop,nop,TS val 1200 ecr 200> diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt new file mode 100644 index 000000000000..174ce9a1bfc0 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we reject TS val updates on a packet with an invalid ACK sequence + +`./defaults.sh +` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +.1 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0> + +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100> + +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100> + +0 accept(3, ..., ...) = 4 + +// Bad packet with a high TSval (its ACK sequence is above our snd_nxt) + +0 < F. 1:1(0) ack 9999 win 20000 <nop,nop,TS val 200000 ecr 100> + + + +0 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 201 ecr 100> + +0 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 201> diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt new file mode 100644 index 000000000000..2e3b3bb7493a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send RST packet with correct TSval +--tcp_ts_tick_usecs=1000 + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 <mss 1000,sackOK,TS val 100 ecr 0> + +0 > S. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 100> + +.1 < . 1:1(0) ack 1 win 20000 <nop,nop,TS val 200 ecr 100> + +0 accept(3, ..., ...) = 4 + + +0 < . 1:1001(1000) ack 1 win 20000 <nop,nop,TS val 201 ecr 100> + +0 > . 1:1(0) ack 1001 <nop,nop,TS val 200 ecr 201> + + +1 close(4) = 0 +// Check that the RST TSval is updated properly; one second has passed since the last sent packet. + +0 > R. 1:1(0) ack 1001 <nop,nop,TS val 1200 ecr 201> diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt new file mode 100644 index 000000000000..183051ba0cae --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + + +0 < S 0:0(0) win 0 <mss 1460> + +0 > S. 0:0(0) ack 1 <mss 1460> + + +.1 < . 1:1(0) ack 1 win 65530 + +0 accept(3, ..., ...)
= 4 + + +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 write(4, ..., 24) = 24 + +0 > P. 1:25(24) ack 1 + +.1 < . 1:1(0) ack 25 win 65530 + +0 %{ assert tcpi_probes == 0, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +// Install a qdisc dropping all packets. + +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` + +0 write(4, ..., 24) = 24 + // When the qdisc is congested we retry every 500ms + // (TCP_RESOURCE_PROBE_INTERVAL) and therefore + // we retry 6 times before hitting the 3s timeout. + // First verify that the connection is alive: ++3.250 write(4, ..., 24) = 24 + // Now verify that shortly after that the socket is dead: + +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) + + +0 %{ assert tcpi_probes == 6, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +0 close(4) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt new file mode 100644 index 000000000000..2efe02bfba9c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + +.1 < . 1:1(0) ack 1 win 32792 + + + +0 accept(3, ..., ...) = 4 + +// Okay, we received nothing, and decide to close this idle socket. +// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth +// trying hard to cleanly close this flow, at the price of keeping +// a TCP structure in kernel for about 1 minute! + +2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 close(4) = 0 + + +0 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.6~+.800 > F. 1:1(0) ack 1 + +// We finally receive something from the peer, but it is way too late. +// Our socket vanished because TCP_USER_TIMEOUT was really small. + +0 < . 1:2(1) ack 1 win 32792 + +0 > R 1:1(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt new file mode 100644 index 000000000000..8bd60226ccfc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that established connections drop a segment without the ACK flag set. + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 <mss 1000,sackOK,nop,nop> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK> + +.01 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + +// Receive a segment with no flags set, verify that it's not enqueued. + +.01 < - 1:1001(1000) win 20000 + +0 ioctl(4, SIOCINQ, [0]) = 0 + +// Receive a segment with the ACK flag set, verify that it is enqueued. + +.01 < .
1:1001(1000) ack 1 win 20000 + +0 ioctl(4, SIOCINQ, [1000]) = 0 diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 1a706d03bb6b..9a85f93c33d8 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -44,9 +44,11 @@ struct tls_crypto_info_keys { }; static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type, - struct tls_crypto_info_keys *tls12) + struct tls_crypto_info_keys *tls12, + char key_generation) { - memset(tls12, 0, sizeof(*tls12)); + memset(tls12, key_generation, sizeof(*tls12)); + memset(tls12, 0, sizeof(struct tls_crypto_info)); switch (cipher_type) { case TLS_CIPHER_CHACHA20_POLY1305: @@ -275,7 +277,7 @@ TEST_F(tls_basic, recseq_wrap) if (self->notls) SKIP(return, "no TLS support"); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); memset(&tls12.aes128.rec_seq, 0xff, sizeof(tls12.aes128.rec_seq)); ASSERT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); @@ -391,7 +393,7 @@ FIXTURE_SETUP(tls) SKIP(return, "Unsupported cipher in FIPS mode"); tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); @@ -1175,7 +1177,7 @@ TEST_F(tls, bidir) struct tls_crypto_info_keys tls12; tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, tls12.len); @@ -1614,7 +1616,7 @@ TEST_F(tls, getsockopt) EXPECT_EQ(get.crypto_info.cipher_type, variant->cipher_type); /* get the full crypto_info */ - tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect, 0); len = expect.len; memrnd(&get, sizeof(get)); EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &get, &len), 0); @@ -1668,6 +1670,464 @@ TEST_F(tls, recv_efault) EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0); } +#define TLS_RECORD_TYPE_HANDSHAKE 0x16 +/* key_update, length 1, update_not_requested */ +static const char key_update_msg[] = "\x18\x00\x00\x01\x00"; +static void tls_send_keyupdate(struct __test_metadata *_metadata, int fd) +{ + size_t len = sizeof(key_update_msg); + + EXPECT_EQ(tls_send_cmsg(fd, TLS_RECORD_TYPE_HANDSHAKE, + (char *)key_update_msg, len, 0), + len); +} + +static void tls_recv_keyupdate(struct __test_metadata *_metadata, int fd, int flags) +{ + char buf[100]; + + EXPECT_EQ(tls_recv_cmsg(_metadata, fd, TLS_RECORD_TYPE_HANDSHAKE, buf, sizeof(buf), flags), + sizeof(key_update_msg)); + EXPECT_EQ(memcmp(buf, key_update_msg, sizeof(key_update_msg)), 0); +} + +/* set the key to 0 then 1 for RX, immediately to 1 for TX */ +TEST_F(tls_basic, rekey_rx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, 
buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 0); +} + +/* set the key to 0 then 1 for TX, immediately to 1 for RX */ +TEST_F(tls_basic, rekey_tx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 0); +} + +TEST_F(tls, rekey) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* send after rekey */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_fail) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + + if (variant->tls_version != TLS_1_3_VERSION) { + /* just check that rekey is not supported and return */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EBUSY); + return; + } + + /* successful update */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* invalid update: change 
of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update (RX socket): change of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update: change of cipher */ + if (variant->cipher_type == TLS_CIPHER_AES_GCM_256) + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_CHACHA20_POLY1305, &tls12, 1); + else + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_256, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* send after rekey, the invalid updates shouldn't have an effect */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_peek) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_PEEK), -1); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); +} + +TEST_F(tls, splice_rekey) +{ + int send_len = TLS_PAYLOAD_MAX_LEN / 2; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + struct tls_crypto_info_keys tls12; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), 
send_len); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + /* can't splice the KeyUpdate */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EINVAL); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* can't splice before updating the key */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); +} + +TEST_F(tls, rekey_peek_splice) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + ASSERT_GE(pipe(p), 0); + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_recv, test_str_1, send_len), 0); +} + +TEST_F(tls, rekey_getsockopt) +{ + struct tls_crypto_info_keys tls12; + struct tls_crypto_info_keys tls12_get; + socklen_t len; + + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + tls_recv_keyupdate(_metadata, self->cfd, 0); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); +} + +TEST_F(tls, rekey_poll_pending) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + 
tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* send immediately after rekey */ + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + /* key hasn't been updated, expect cfd to be non-readable */ + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 0), 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + + exit(!__test_passed(_metadata)); + } +} + +TEST_F(tls, rekey_poll_delay) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + sleep(1); + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + exit(!__test_passed(_metadata)); + } +} + FIXTURE(tls_err) { int fd, cfd; @@ -1696,7 +2156,7 @@ FIXTURE_SETUP(tls_err) int ret; tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_128, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); ulp_sock_pair(_metadata, &self->fd2, &self->cfd2, &self->notls); @@ -2118,7 +2578,7 @@ TEST(tls_v6ops) { int sfd, ret, fd; socklen_t len, len2; - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); addr.sin6_family = AF_INET6; addr.sin6_addr = in6addr_any; @@ -2177,7 +2637,7 @@ TEST(prequeue) { len = sizeof(addr); memrnd(buf, sizeof(buf)); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12, 0); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index 640bc43452fa..88fa1d53ba2b 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -92,6 +92,9 @@ run_udp() { echo "udp" run_in_netns ${args} + echo "udp sendmmsg" + run_in_netns ${args} -m + echo "udp gso" run_in_netns ${args} -S 0 diff --git a/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py new file mode 100755 index 
000000000000..0f44a6199495 --- /dev/null +++ b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Script that checks that SFQ rejects a limit of 1 at the kernel +# level. We can't use iproute2's tc because it does not accept a limit +# of 1. + +import sys +import os + +from pyroute2 import IPRoute +from pyroute2.netlink.exceptions import NetlinkError + +ip = IPRoute() +ifidx = ip.link_lookup(ifname=sys.argv[1]) + +try: + ip.tc('add', 'sfq', ifidx, limit=1) + sys.exit(1) +except NetlinkError: + sys.exit(0) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json index 16d51936b385..50e8d72781cb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json @@ -208,5 +208,25 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "4d6f", + "name": "Check that limit of 1 is rejected", + "category": [ + "qdisc", + "sfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + ], + "cmdUnderTest": "./scripts/sfq_rejects_limit_1.py $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "sfq", + "matchCount": "0", + "teardown": [ + ] } ]