diff options
38 files changed, 1025 insertions, 568 deletions
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index b4a56e61631a..578bbec1ce18 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2116,7 +2116,7 @@ static void qede_vlan_mark_nonconfigured(struct qede_dev *edev) edev->accept_any_vlan = false; } -int qede_set_features(struct net_device *dev, netdev_features_t features) +static int qede_set_features(struct net_device *dev, netdev_features_t features) { struct qede_dev *edev = netdev_priv(dev); netdev_features_t changes = features ^ dev->features; diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index f38696ceee74..908e72e18ef7 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -1724,24 +1724,21 @@ static void velocity_free_tx_buf(struct velocity_info *vptr, struct velocity_td_info *tdinfo, struct tx_desc *td) { struct sk_buff *skb = tdinfo->skb; + int i; /* * Don't unmap the pre-allocated tx_bufs */ - if (tdinfo->skb_dma) { - int i; + for (i = 0; i < tdinfo->nskb_dma; i++) { + size_t pktlen = max_t(size_t, skb->len, ETH_ZLEN); - for (i = 0; i < tdinfo->nskb_dma; i++) { - size_t pktlen = max_t(size_t, skb->len, ETH_ZLEN); + /* For scatter-gather */ + if (skb_shinfo(skb)->nr_frags > 0) + pktlen = max_t(size_t, pktlen, + td->td_buf[i].size & ~TD_QUEUE); - /* For scatter-gather */ - if (skb_shinfo(skb)->nr_frags > 0) - pktlen = max_t(size_t, pktlen, - td->td_buf[i].size & ~TD_QUEUE); - - dma_unmap_single(vptr->dev, tdinfo->skb_dma[i], - le16_to_cpu(pktlen), DMA_TO_DEVICE); - } + dma_unmap_single(vptr->dev, tdinfo->skb_dma[i], + le16_to_cpu(pktlen), DMA_TO_DEVICE); } dev_kfree_skb_irq(skb); tdinfo->skb = NULL; diff --git a/drivers/net/ethernet/xilinx/Kconfig b/drivers/net/ethernet/xilinx/Kconfig index 4f5c024c6192..6d68c8a8f4f2 100644 --- a/drivers/net/ethernet/xilinx/Kconfig +++ b/drivers/net/ethernet/xilinx/Kconfig @@ -5,7 +5,7 @@ config NET_VENDOR_XILINX bool "Xilinx devices" default y - depends on PPC || PPC32 || MICROBLAZE || ARCH_ZYNQ + depends on PPC || PPC32 || MICROBLAZE || ARCH_ZYNQ || MIPS ---help--- If you have a network (Ethernet) card belonging to this class, say Y. @@ -18,7 +18,7 @@ if NET_VENDOR_XILINX config XILINX_EMACLITE tristate "Xilinx 10/100 Ethernet Lite support" - depends on (PPC32 || MICROBLAZE || ARCH_ZYNQ) + depends on PPC32 || MICROBLAZE || ARCH_ZYNQ || MIPS select PHYLIB ---help--- This driver supports the 10/100 Ethernet Lite from Xilinx. diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 351e701eb043..3ea47f28e143 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2973,6 +2973,7 @@ static void macsec_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &macsec_netdev_ops; dev->destructor = macsec_free_netdev; + SET_NETDEV_DEVTYPE(dev, &macsec_type); eth_zero_addr(dev->broadcast); } diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 1c3e07c3d0b8..87b566f54cc1 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -274,9 +274,9 @@ config MICROCHIP_PHY Supports the LAN88XX PHYs. config MICROSEMI_PHY - tristate "Microsemi PHYs" - ---help--- - Currently supports the VSC8531 and VSC8541 PHYs + tristate "Microsemi PHYs" + ---help--- + Currently supports the VSC8531 and VSC8541 PHYs config NATIONAL_PHY tristate "National Semiconductor PHYs" diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c index ad33390b382a..c09cc4a3d166 100644 --- a/drivers/net/phy/mscc.c +++ b/drivers/net/phy/mscc.c @@ -13,135 +13,135 @@ #include <linux/phy.h> enum rgmii_rx_clock_delay { - RGMII_RX_CLK_DELAY_0_2_NS = 0, - RGMII_RX_CLK_DELAY_0_8_NS = 1, - RGMII_RX_CLK_DELAY_1_1_NS = 2, - RGMII_RX_CLK_DELAY_1_7_NS = 3, - RGMII_RX_CLK_DELAY_2_0_NS = 4, - RGMII_RX_CLK_DELAY_2_3_NS = 5, - RGMII_RX_CLK_DELAY_2_6_NS = 6, - RGMII_RX_CLK_DELAY_3_4_NS = 7 + RGMII_RX_CLK_DELAY_0_2_NS = 0, + RGMII_RX_CLK_DELAY_0_8_NS = 1, + RGMII_RX_CLK_DELAY_1_1_NS = 2, + RGMII_RX_CLK_DELAY_1_7_NS = 3, + RGMII_RX_CLK_DELAY_2_0_NS = 4, + RGMII_RX_CLK_DELAY_2_3_NS = 5, + RGMII_RX_CLK_DELAY_2_6_NS = 6, + RGMII_RX_CLK_DELAY_3_4_NS = 7 }; -#define MII_VSC85XX_INT_MASK 25 -#define MII_VSC85XX_INT_MASK_MASK 0xa000 -#define MII_VSC85XX_INT_STATUS 26 +#define MII_VSC85XX_INT_MASK 25 +#define MII_VSC85XX_INT_MASK_MASK 0xa000 +#define MII_VSC85XX_INT_STATUS 26 -#define MSCC_EXT_PAGE_ACCESS 31 -#define MSCC_PHY_PAGE_STANDARD 0x0000 /* Standard registers */ -#define MSCC_PHY_PAGE_EXTENDED_2 0x0002 /* Extended reg - page 2 */ +#define MSCC_EXT_PAGE_ACCESS 31 +#define MSCC_PHY_PAGE_STANDARD 0x0000 /* Standard registers */ +#define MSCC_PHY_PAGE_EXTENDED_2 0x0002 /* Extended reg - page 2 */ /* Extended Page 2 Registers */ -#define MSCC_PHY_RGMII_CNTL 20 -#define RGMII_RX_CLK_DELAY_MASK 0x0070 -#define RGMII_RX_CLK_DELAY_POS 4 +#define MSCC_PHY_RGMII_CNTL 20 +#define RGMII_RX_CLK_DELAY_MASK 0x0070 +#define RGMII_RX_CLK_DELAY_POS 4 /* Microsemi PHY ID's */ -#define PHY_ID_VSC8531 0x00070570 -#define PHY_ID_VSC8541 0x00070770 +#define PHY_ID_VSC8531 0x00070570 +#define PHY_ID_VSC8541 0x00070770 static int vsc85xx_phy_page_set(struct phy_device *phydev, u8 page) { - int rc; + int rc; - rc = phy_write(phydev, MSCC_EXT_PAGE_ACCESS, page); - return rc; + rc = phy_write(phydev, MSCC_EXT_PAGE_ACCESS, page); + return rc; } static int vsc85xx_default_config(struct phy_device *phydev) { - int rc; - u16 reg_val; + int rc; + u16 reg_val; - mutex_lock(&phydev->lock); - rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2); - if (rc != 0) - goto out_unlock; + mutex_lock(&phydev->lock); + rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED_2); + if (rc != 0) + goto out_unlock; - reg_val = phy_read(phydev, MSCC_PHY_RGMII_CNTL); - reg_val &= ~(RGMII_RX_CLK_DELAY_MASK); - reg_val |= (RGMII_RX_CLK_DELAY_1_1_NS << RGMII_RX_CLK_DELAY_POS); - phy_write(phydev, MSCC_PHY_RGMII_CNTL, reg_val); - rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD); + reg_val = phy_read(phydev, MSCC_PHY_RGMII_CNTL); + reg_val &= ~(RGMII_RX_CLK_DELAY_MASK); + reg_val |= (RGMII_RX_CLK_DELAY_1_1_NS << RGMII_RX_CLK_DELAY_POS); + phy_write(phydev, MSCC_PHY_RGMII_CNTL, reg_val); + rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD); out_unlock: - mutex_unlock(&phydev->lock); + mutex_unlock(&phydev->lock); - return rc; + return rc; } static int vsc85xx_config_init(struct phy_device *phydev) { - int rc; + int rc; - rc = vsc85xx_default_config(phydev); - if (rc) - return rc; - rc = genphy_config_init(phydev); + rc = vsc85xx_default_config(phydev); + if (rc) + return rc; + rc = genphy_config_init(phydev); - return rc; + return rc; } static int vsc85xx_ack_interrupt(struct phy_device *phydev) { - int rc = 0; + int rc = 0; - if (phydev->interrupts == PHY_INTERRUPT_ENABLED) - rc = phy_read(phydev, MII_VSC85XX_INT_STATUS); + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) + rc = phy_read(phydev, MII_VSC85XX_INT_STATUS); - return (rc < 0) ? rc : 0; + return (rc < 0) ? rc : 0; } static int vsc85xx_config_intr(struct phy_device *phydev) { - int rc; - - if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { - rc = phy_write(phydev, MII_VSC85XX_INT_MASK, - MII_VSC85XX_INT_MASK_MASK); - } else { - rc = phy_write(phydev, MII_VSC85XX_INT_MASK, 0); - if (rc < 0) - return rc; - rc = phy_read(phydev, MII_VSC85XX_INT_STATUS); - } - - return rc; + int rc; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + rc = phy_write(phydev, MII_VSC85XX_INT_MASK, + MII_VSC85XX_INT_MASK_MASK); + } else { + rc = phy_write(phydev, MII_VSC85XX_INT_MASK, 0); + if (rc < 0) + return rc; + rc = phy_read(phydev, MII_VSC85XX_INT_STATUS); + } + + return rc; } /* Microsemi VSC85xx PHYs */ static struct phy_driver vsc85xx_driver[] = { { - .phy_id = PHY_ID_VSC8531, - .name = "Microsemi VSC8531", - .phy_id_mask = 0xfffffff0, - .features = PHY_GBIT_FEATURES, - .flags = PHY_HAS_INTERRUPT, - .soft_reset = &genphy_soft_reset, - .config_init = &vsc85xx_config_init, - .config_aneg = &genphy_config_aneg, - .aneg_done = &genphy_aneg_done, - .read_status = &genphy_read_status, - .ack_interrupt = &vsc85xx_ack_interrupt, - .config_intr = &vsc85xx_config_intr, - .suspend = &genphy_suspend, - .resume = &genphy_resume, + .phy_id = PHY_ID_VSC8531, + .name = "Microsemi VSC8531", + .phy_id_mask = 0xfffffff0, + .features = PHY_GBIT_FEATURES, + .flags = PHY_HAS_INTERRUPT, + .soft_reset = &genphy_soft_reset, + .config_init = &vsc85xx_config_init, + .config_aneg = &genphy_config_aneg, + .aneg_done = &genphy_aneg_done, + .read_status = &genphy_read_status, + .ack_interrupt = &vsc85xx_ack_interrupt, + .config_intr = &vsc85xx_config_intr, + .suspend = &genphy_suspend, + .resume = &genphy_resume, }, { - .phy_id = PHY_ID_VSC8541, - .name = "Microsemi VSC8541 SyncE", - .phy_id_mask = 0xfffffff0, - .features = PHY_GBIT_FEATURES, - .flags = PHY_HAS_INTERRUPT, - .soft_reset = &genphy_soft_reset, - .config_init = &vsc85xx_config_init, - .config_aneg = &genphy_config_aneg, - .aneg_done = &genphy_aneg_done, - .read_status = &genphy_read_status, - .ack_interrupt = &vsc85xx_ack_interrupt, - .config_intr = &vsc85xx_config_intr, - .suspend = &genphy_suspend, - .resume = &genphy_resume, + .phy_id = PHY_ID_VSC8541, + .name = "Microsemi VSC8541 SyncE", + .phy_id_mask = 0xfffffff0, + .features = PHY_GBIT_FEATURES, + .flags = PHY_HAS_INTERRUPT, + .soft_reset = &genphy_soft_reset, + .config_init = &vsc85xx_config_init, + .config_aneg = &genphy_config_aneg, + .aneg_done = &genphy_aneg_done, + .read_status = &genphy_read_status, + .ack_interrupt = &vsc85xx_ack_interrupt, + .config_intr = &vsc85xx_config_intr, + .suspend = &genphy_suspend, + .resume = &genphy_resume, } }; @@ -149,9 +149,9 @@ static struct phy_driver vsc85xx_driver[] = { module_phy_driver(vsc85xx_driver); static struct mdio_device_id __maybe_unused vsc85xx_tbl[] = { - { PHY_ID_VSC8531, 0xfffffff0, }, - { PHY_ID_VSC8541, 0xfffffff0, }, - { } + { PHY_ID_VSC8531, 0xfffffff0, }, + { PHY_ID_VSC8541, 0xfffffff0, }, + { } }; MODULE_DEVICE_TABLE(mdio, vsc85xx_tbl); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 49d4aef1f789..3319d97d789d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -272,6 +272,23 @@ static inline int vlan_get_encap_level(struct net_device *dev) } #endif +/** + * eth_type_vlan - check for valid vlan ether type. + * @ethertype: ether type to check + * + * Returns true if the ether type is a vlan ether type. + */ +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -425,8 +442,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; - if (veth->h_vlan_proto != htons(ETH_P_8021Q) && - veth->h_vlan_proto != htons(ETH_P_8021AD)) + if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; *vlan_tci = ntohs(veth->h_vlan_TCI); @@ -488,7 +504,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; @@ -506,8 +522,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); + } while (eth_type_vlan(type)); } if (depth) @@ -572,8 +587,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && - likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + likely(!eth_type_vlan(skb->protocol))) return false; return true; @@ -593,15 +607,14 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; - if (likely(protocol != htons(ETH_P_8021Q) && - protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(protocol))) return false; veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) + if (!eth_type_vlan(protocol)) return false; return true; diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index feb04ea20f11..65da430e260f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns); + struct user_namespace *user_ns, bool net_admin); extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 24cd3949a9a4..27bb9633c69d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -11,7 +11,7 @@ struct ctl_table_header; struct xfrm_policy_hash { - struct hlist_head *table; + struct hlist_head __rcu *table; unsigned int hmask; u8 dbits4; u8 sbits4; @@ -38,14 +38,12 @@ struct netns_xfrm { * mode. Also, it can be used by ah/esp icmp error handler to find * offending SA. */ - struct hlist_head *state_bydst; - struct hlist_head *state_bysrc; - struct hlist_head *state_byspi; + struct hlist_head __rcu *state_bydst; + struct hlist_head __rcu *state_bysrc; + struct hlist_head __rcu *state_byspi; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; - struct hlist_head state_gc_list; - struct work_struct state_gc_work; struct list_head policy_all; struct hlist_head *policy_byidx; @@ -73,7 +71,7 @@ struct netns_xfrm { struct dst_ops xfrm6_dst_ops; #endif spinlock_t xfrm_state_lock; - rwlock_t xfrm_policy_lock; + spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; /* flow cache part */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d6ae36512429..fdfbedd61c67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (skb_queue_empty(&tp->out_of_order_queue) && + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..d2fdd6d70959 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -187,7 +187,7 @@ struct xfrm_state { struct xfrm_replay_state_esn *preplay_esn; /* The functions for replay detection. */ - struct xfrm_replay *repl; + const struct xfrm_replay *repl; /* internal flag that only holds state for delayed aevent at the * moment diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5581206a08ae..b5c366f87b3e 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -123,6 +123,7 @@ enum { INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, + INET_DIAG_MARK, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 54c3b4f4aceb..59ed3992c760 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -605,13 +605,13 @@ struct ovs_action_push_mpls { * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set * (but it will not be set in the 802.1Q header that is pushed). * - * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID - * values are those that the kernel module also parses as 802.1Q headers, to - * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN - * from having surprising results. + * The @vlan_tpid value is typically %ETH_P_8021Q or %ETH_P_8021AD. + * The only acceptable TPID values are those that the kernel module also parses + * as 802.1Q or 802.1AD headers, to prevent %OVS_ACTION_ATTR_PUSH_VLAN followed + * by %OVS_ACTION_ATTR_POP_VLAN from having surprising results. */ struct ovs_action_push_vlan { - __be16 vlan_tpid; /* 802.1Q TPID. */ + __be16 vlan_tpid; /* 802.1Q or 802.1ad TPID. */ __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; @@ -721,9 +721,10 @@ enum ovs_nat_attr { * is copied from the value to the packet header field, rest of the bits are * left unchanged. The non-masked value bits must be passed in as zeroes. * Masking is not supported for the %OVS_KEY_ATTR_TUNNEL attribute. - * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the - * packet. - * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. + * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q or 802.1ad header + * onto the packet. + * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q or 802.1ad header + * from the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48c2705db22c..90493a66dddd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1637,21 +1637,42 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct verifier_env *env, - struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct verifier_state *state, + const struct reg_state *dst_reg) { - struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs, *reg; int i; - /* r2 = r3; - * r2 += 8 - * if (r2 > pkt_end) goto somewhere - * r2 == dst_reg, pkt_end == src_reg, - * r2=pkt(id=n,off=8,r=0) - * r3=pkt(id=n,off=0,r=0) - * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access + + /* LLVM can generate two kind of checks: + * + * Type 1: + * + * r2 = r3; + * r2 += 8; + * if (r2 > pkt_end) goto <handle exception> + * <access okay> + * + * Where: + * r2 == dst_reg, pkt_end == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Type 2: + * + * r2 = r3; + * r2 += 8; + * if (pkt_end >= r2) goto <access okay> + * <handle exception> + * + * Where: + * pkt_end == dst_reg, r2 == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access. */ + for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) regs[i].range = dst_reg->off; @@ -1668,8 +1689,8 @@ static void find_good_pkt_pointers(struct verifier_env *env, static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; - struct verifier_state *other_branch; + struct verifier_state *other_branch, *this_branch = &env->cur_state; + struct reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1750,13 +1771,17 @@ static int check_cond_jmp_op(struct verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(env, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } if (log_level) - print_verifier_state(&env->cur_state); + print_verifier_state(this_branch); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1dfca1c3f8f5..937e459bdaa9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3669,10 +3669,6 @@ nla_put_failure: return -EMSGSIZE; } -static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { - [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, -}; - static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..1e329d411242 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2445,6 +2445,25 @@ void skb_queue_purge(struct sk_buff_head *list) EXPORT_SYMBOL(skb_queue_purge); /** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + +/** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e2ffc2a5c7db..241f27bbd7ad 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1081,7 +1081,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; - unsigned int nlflags = 0; + u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; @@ -1126,6 +1126,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (cfg->fc_nlflags & NLM_F_EXCL) goto out; + nlflags &= ~NLM_F_EXCL; + /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes @@ -1151,6 +1153,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) struct fib_info *fi_drop; u8 state; + nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) @@ -1191,7 +1194,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); + tb->tb_id, &cfg->fc_nlinfo, nlflags); goto succeeded; } @@ -1203,7 +1206,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) - nlflags = NLM_F_APPEND; + nlflags |= NLM_F_APPEND; else fa = fa_first; } @@ -1211,6 +1214,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; + nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index abfbe492ebfe..e4d16fc5bbb3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void) int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns) + struct user_namespace *user_ns, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); @@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; - r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -948,7 +965,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 71a52f4d4cff..af4919792b6a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->ttl = val; break; case IP_TOS: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) + val = *(int *)CMSG_DATA(cmsg); + else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) + val = *(u8 *)CMSG_DATA(cmsg); + else return -EINVAL; - val = *(int *)CMSG_DATA(cmsg); if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77311a92275c..a13fcb369f52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); @@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8cd02c0b056c..a5934c4c8cd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. + */ + break; + } } } @@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. + */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto add_sack; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + p = &parent->rb_right; } - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; - } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); - } - } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4651,13 +4664,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4711,48 +4724,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4770,13 +4811,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4786,17 +4824,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -4814,14 +4857,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4830,43 +4876,43 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: This is possible p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. + */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } @@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct rb_node *node, *prev; - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) { - tcp_drop(sk, skb); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; - } + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a75bf48d7950..04b989328558 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1845,7 +1845,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..f63c73dc0acb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -488,7 +488,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd_cnt = 0; tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 58b79c0c0d69..9a89c10a55f0 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 771be1fa4176..ef5485204522 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + u16 nlflags = NLM_F_EXCL; int err; ins = &fn->leaf; @@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; + + nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; @@ -856,6 +859,7 @@ next_iter: pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: + nlflags |= NLM_F_CREATE; err = fib6_commit_metrics(&rt->dst, mxc); if (err) return err; @@ -864,7 +868,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ca91fc33f8a9..4fe9032b1160 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -246,20 +246,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = 0; + } else { + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + } return err; } static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = vlan->vlan_tci; + } else { + key->eth.vlan.tci = vlan->vlan_tci; + key->eth.vlan.tpid = vlan->vlan_tpid; + } return skb_vlan_push(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0ea128eeeab2..1240ae3b88d2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -302,24 +302,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +/** + * Parse vlan tag from vlan header. + * Returns ERROR on memory error. + * Returns 0 if it encounters a non-vlan or incomplete packet. + * Returns 1 after successfully parsing vlan tag. + */ +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) { - struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ - __be16 tci; - }; - struct qtag_prefix *qp; + struct vlan_head *vh = (struct vlan_head *)skb->data; - if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + if (likely(!eth_type_vlan(vh->tpid))) return 0; - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + - sizeof(__be16)))) + if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + + sizeof(__be16)))) return -ENOMEM; - qp = (struct qtag_prefix *) skb->data; - key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + vh = (struct vlan_head *)skb->data; + key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tpid = vh->tpid; + + __skb_pull(skb, sizeof(struct vlan_head)); + return 1; +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int res; + + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + key->eth.cvlan.tci = 0; + key->eth.cvlan.tpid = 0; + + if (likely(skb_vlan_tag_present(skb))) { + key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tpid = skb->vlan_proto; + } else { + /* Parse outer vlan tag in the non-accelerated case. */ + res = parse_vlan_tag(skb, &key->eth.vlan); + if (res <= 0) + return res; + } + + /* Parse inner vlan tag. */ + res = parse_vlan_tag(skb, &key->eth.cvlan); + if (res <= 0) + return res; return 0; } @@ -480,12 +513,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. */ - key->eth.tci = 0; - if (skb_vlan_tag_present(skb)) - key->eth.tci = htons(skb->vlan_tci); - else if (eth->h_proto == htons(ETH_P_8021Q)) - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; key->eth.type = parse_ethertype(skb); if (unlikely(key->eth.type == htons(0))) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 03378e75a67c..156a3029c17b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -50,6 +50,11 @@ struct ovs_tunnel_info { struct metadata_dst *tun_dst; }; +struct vlan_head { + __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -69,7 +74,8 @@ struct sw_flow_key { struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + struct vlan_head vlan; + struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; union { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c78a6a1476fb..8efa718ddb5e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, ip_tunnel_info_af(tun_info)); } +static int encode_vlan_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *a[], + bool is_mask, bool inner) +{ + __be16 tci = 0; + __be16 tpid = 0; + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (likely(!inner)) { + SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); + } else { + SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); + } + return 0; +} + +static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { + /* Not a VLAN. */ + return 0; + } + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (tci) { + OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { + /* Corner case for truncated VLAN header. */ + OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + } + + return 1; +} + +static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + __be16 tpid = 0; + bool encap_valid = !!(match->key->eth.vlan.tci & + htons(VLAN_TAG_PRESENT)); + bool i_encap_valid = !!(match->key->eth.cvlan.tci & + htons(VLAN_TAG_PRESENT)); + + if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { + /* Not a VLAN. */ + return 0; + } + + if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { + OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (tpid != htons(0xffff)) { + OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", + (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); + return -EINVAL; + } + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + return 1; +} + +static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, bool inner, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + const struct nlattr *encap; + + if (!is_mask) + err = validate_vlan_from_nlattrs(match, *key_attrs, inner, + a, log); + else + err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, + a, log); + if (err <= 0) + return err; + + err = encode_vlan_from_nlattrs(match, a, is_mask, inner); + if (err) + return err; + + *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + + encap = a[OVS_KEY_ATTR_ENCAP]; + + if (!is_mask) + err = parse_flow_nlattrs(encap, a, key_attrs, log); + else + err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); + + return err; +} + +static int parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + bool encap_valid = false; + + err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, + is_mask, log); + if (err) + return err; + + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + if (encap_valid) { + err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, + is_mask, log); + if (err) + return err; + } + + return 0; +} + static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask, bool log) @@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, } if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); - else - OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. + */ + OVS_NLERR(log, "VLAN attribute unexpected."); + return -EINVAL; } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { @@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; struct nlattr *newmask = NULL; u64 key_attrs = 0; u64 mask_attrs = 0; - bool encap_valid = false; int err; err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR(log, "Invalid Vlan frame."); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs, log); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) { - OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); - return -EINVAL; - } - } else { - OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); - return -EINVAL; - } - } + err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); + if (err) + return err; err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) @@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, goto free_newmask; /* Always match on tci. */ - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); - err = -EINVAL; - goto free_newmask; - } - - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, - &mask_attrs, log); - if (err) - goto free_newmask; - } else { - OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", - ntohs(eth_type)); - err = -EINVAL; - goto free_newmask; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", - ntohs(tci)); - err = -EINVAL; - goto free_newmask; - } - } + err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); + if (err) + goto free_newmask; err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, log); @@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } +static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, + bool is_mask) +{ + __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff); + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) + return -EMSGSIZE; + return 0; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; + struct nlattr *nla; + struct nlattr *encap = NULL; + struct nlattr *in_encap = NULL; if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) goto nla_put_failure; @@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(eth_key->eth_src, output->eth.src); ether_addr_copy(eth_key->eth_dst, output->eth.dst); - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) + if (!swkey->eth.vlan.tci) goto unencap; - } else - encap = NULL; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } + } if (swkey->eth.type == htons(ETH_P_802_2)) { /* @@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; + if (eth_type_vlan(swkey->eth.type)) { + /* There are 3 VLAN tags, we don't know anything about the rest + * of the packet, so truncate here. + */ + WARN_ON_ONCE(!(encap && in_encap)); + goto unencap; + } + if (swkey->eth.type == htons(ETH_P_IP)) { struct ovs_key_ipv4 *ipv4_key; @@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, } unencap: + if (in_encap) + nla_nest_end(skb, in_encap); if (encap) nla_nest_end(skb, encap); @@ -2283,7 +2397,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_PUSH_VLAN: vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; @@ -2388,7 +2502,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, - key->eth.tci, log); + key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6b21fd068d87..8f198437c724 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -485,9 +485,14 @@ static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; - if (skb->protocol == htons(ETH_P_8021Q)) + if (skb_vlan_tagged(skb)) length -= VLAN_HLEN; + /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow + * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none + * account for 802.1ad. e.g. is_skb_forwardable(). + */ + return length; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index f3508aa75815..807158e32f5f 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; @@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r->idiag_retrans = 0; } - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { @@ -203,6 +204,7 @@ struct sctp_comm_param { struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; + bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) @@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) @@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); release_sock(sk); if (err < 0) { WARN_ON(err == -EMSGSIZE); @@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh) < 0) { + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { cb->args[3] = 1; err = 2; goto release; @@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { err = 2; goto release; } @@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh) < 0) { + cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } @@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, .skb = in_skb, .r = req, .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { @@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .skb = skb, .cb = cb, .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; /* eps hashtable dumps diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index a750f330b8dd..f83b74d3e2ac 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1500,12 +1500,8 @@ out_fac_release: goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; - if (dtefacs.calling_ae == NULL) - goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; - if (dtefacs.called_ae == NULL) - goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..f7ce6265961a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] __read_mostly; static struct kmem_cache *xfrm_dst_cache __read_mostly; +static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); @@ -59,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) +{ + return atomic_inc_not_zero(&policy->refcnt); +} + static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { @@ -385,9 +391,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net, __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); - return (hash == hmask + 1 ? - &net->xfrm.policy_inexact[dir] : - net->xfrm.policy_bydst[dir].table + hash); + if (hash == hmask + 1) + return &net->xfrm.policy_inexact[dir]; + + return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, @@ -403,7 +411,8 @@ static struct hlist_head *policy_hash_direct(struct net *net, __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); - return net->xfrm.policy_bydst[dir].table + hash; + return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, @@ -426,14 +435,14 @@ redo: h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0) { - hlist_del(&pol->bydst); - hlist_add_head(&pol->bydst, ndsttable+h); + hlist_del_rcu(&pol->bydst); + hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; - hlist_del(&pol->bydst); - hlist_add_behind(&pol->bydst, entry0); + hlist_del_rcu(&pol->bydst); + hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -468,22 +477,32 @@ static void xfrm_bydst_resize(struct net *net, int dir) unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); - struct hlist_head *odst = net->xfrm.policy_bydst[dir].table; struct hlist_head *ndst = xfrm_hash_alloc(nsize); + struct hlist_head *odst; int i; if (!ndst) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + write_seqcount_begin(&xfrm_policy_hash_generation); + + odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); + + odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); - net->xfrm.policy_bydst[dir].table = ndst; + rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + write_seqcount_end(&xfrm_policy_hash_generation); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + + synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } @@ -500,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total) if (!nidx) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); @@ -508,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total) net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } @@ -541,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total) void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { - read_lock_bh(&net->xfrm.xfrm_policy_lock); si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; @@ -550,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; - read_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_spd_getinfo); @@ -600,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { @@ -642,7 +659,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) hlist_add_head(&policy->bydst, chain); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } @@ -753,7 +770,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct hlist_head *chain; struct hlist_node *newpos; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; @@ -764,7 +781,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; @@ -800,7 +817,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); @@ -820,7 +837,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, struct hlist_head *chain; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { @@ -833,7 +850,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -842,7 +859,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -861,7 +878,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, return NULL; *err = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { @@ -872,7 +889,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, *err = security_xfrm_policy_delete( pol->security); if (*err) { - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); @@ -881,7 +898,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, break; } } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); @@ -939,7 +956,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) @@ -955,14 +972,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } @@ -974,13 +991,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again2; } } @@ -989,7 +1006,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) if (!cnt) err = -ESRCH; out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -1009,7 +1026,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else @@ -1037,7 +1054,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, } list_del_init(&walk->walk.all); out: - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); @@ -1056,9 +1073,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) if (list_empty(&walk->walk.all)) return; - write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); @@ -1096,17 +1113,24 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, struct xfrm_policy *pol, *ret; const xfrm_address_t *daddr, *saddr; struct hlist_head *chain; - u32 priority = ~0U; + unsigned int sequence; + u32 priority; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; - read_lock_bh(&net->xfrm.xfrm_policy_lock); - chain = policy_hash_direct(net, daddr, saddr, family, dir); + rcu_read_lock(); + retry: + do { + sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + chain = policy_hash_direct(net, daddr, saddr, family, dir); + } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + + priority = ~0U; ret = NULL; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -1122,7 +1146,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } chain = &net->xfrm.policy_inexact[dir]; - hlist_for_each_entry(pol, chain, bydst) { + hlist_for_each_entry_rcu(pol, chain, bydst) { if ((pol->priority >= priority) && ret) break; @@ -1140,9 +1164,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } } - xfrm_pol_hold(ret); + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + goto retry; + + if (ret && !xfrm_pol_hold_rcu(ret)) + goto retry; fail: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return ret; } @@ -1219,10 +1247,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl) { struct xfrm_policy *pol; - struct net *net = sock_net(sk); rcu_read_lock(); - read_lock_bh(&net->xfrm.xfrm_policy_lock); + again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, @@ -1237,8 +1264,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, policy_to_flow_dir(dir)); - if (!err) - xfrm_pol_hold(pol); + if (!err && !xfrm_pol_hold_rcu(pol)) + goto again; else if (err == -ESRCH) pol = NULL; else @@ -1247,7 +1274,6 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, pol = NULL; } out: - read_unlock_bh(&net->xfrm.xfrm_policy_lock); rcu_read_unlock(); return pol; } @@ -1271,7 +1297,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { - hlist_del(&pol->bydst); + hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } @@ -1295,9 +1321,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; @@ -1316,7 +1342,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) return -EINVAL; #endif - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { @@ -1334,7 +1360,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) */ xfrm_sk_policy_unlink(old_pol, dir); } - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); @@ -1364,9 +1390,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->type = old->type; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); - write_lock_bh(&net->xfrm.xfrm_policy_lock); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); - write_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; @@ -3048,7 +3074,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); + spin_lock_init(&net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); return 0; @@ -3082,6 +3108,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); + seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); } @@ -3179,7 +3206,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * struct hlist_head *chain; u32 priority = ~0U; - read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/ + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && @@ -3203,7 +3230,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * xfrm_pol_hold(ret); - read_unlock_bh(&net->xfrm.xfrm_policy_lock); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 4fd725a0c500..cdc2e2e71bff 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) x->repl->notify(x, XFRM_REPLAY_UPDATE); } -static struct xfrm_replay xfrm_replay_legacy = { +static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, .recheck = xfrm_replay_check, @@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = { .overflow = xfrm_replay_overflow, }; -static struct xfrm_replay xfrm_replay_bmp = { +static const struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, .recheck = xfrm_replay_check_bmp, @@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = { .overflow = xfrm_replay_overflow_bmp, }; -static struct xfrm_replay xfrm_replay_esn = { +static const struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, .recheck = xfrm_replay_recheck_esn, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9895a8c56d8c..ba8bf518ba14 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -28,6 +28,11 @@ #include "xfrm_hash.h" +#define xfrm_state_deref_prot(table, net) \ + rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) + +static void xfrm_state_gc_task(struct work_struct *work); + /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) @@ -36,6 +41,15 @@ */ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; +static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); + +static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); +static HLIST_HEAD(xfrm_state_gc_list); + +static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) +{ + return atomic_inc_not_zero(&x->refcnt); +} static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, @@ -76,18 +90,18 @@ static void xfrm_hash_transfer(struct hlist_head *list, h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); - hlist_add_head(&x->bydst, ndsttable+h); + hlist_add_head_rcu(&x->bydst, ndsttable + h); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); - hlist_add_head(&x->bysrc, nsrctable+h); + hlist_add_head_rcu(&x->bysrc, nsrctable + h); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); - hlist_add_head(&x->byspi, nspitable+h); + hlist_add_head_rcu(&x->byspi, nspitable + h); } } } @@ -122,25 +136,29 @@ static void xfrm_hash_resize(struct work_struct *work) } spin_lock_bh(&net->xfrm.xfrm_state_lock); + write_seqcount_begin(&xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; + odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) - xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi, - nhashmask); + xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask); - odst = net->xfrm.state_bydst; - osrc = net->xfrm.state_bysrc; - ospi = net->xfrm.state_byspi; + osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); + ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); ohashmask = net->xfrm.state_hmask; - net->xfrm.state_bydst = ndst; - net->xfrm.state_bysrc = nsrc; - net->xfrm.state_byspi = nspi; + rcu_assign_pointer(net->xfrm.state_bydst, ndst); + rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); + rcu_assign_pointer(net->xfrm.state_byspi, nspi); net->xfrm.state_hmask = nhashmask; + write_seqcount_end(&xfrm_state_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); + + synchronize_rcu(); + xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); @@ -355,15 +373,16 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) static void xfrm_state_gc_task(struct work_struct *work) { - struct net *net = container_of(work, struct net, xfrm.state_gc_work); struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); - hlist_move_list(&net->xfrm.state_gc_list, &gc_list); + hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); + synchronize_rcu(); + hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) xfrm_state_gc_destroy(x); } @@ -500,14 +519,12 @@ EXPORT_SYMBOL(xfrm_state_alloc); void __xfrm_state_destroy(struct xfrm_state *x) { - struct net *net = xs_net(x); - WARN_ON(x->km.state != XFRM_STATE_DEAD); spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&net->xfrm.state_gc_work); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -520,10 +537,10 @@ int __xfrm_state_delete(struct xfrm_state *x) x->km.state = XFRM_STATE_DEAD; spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); - hlist_del(&x->bydst); - hlist_del(&x->bysrc); + hlist_del_rcu(&x->bydst); + hlist_del_rcu(&x->bysrc); if (x->id.spi) - hlist_del(&x->byspi); + hlist_del_rcu(&x->byspi); net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); @@ -659,7 +676,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) { + hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -668,7 +685,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, if ((mark & x->mark.m) != x->mark.v) continue; - xfrm_state_hold(x); + if (!xfrm_state_hold_rcu(x)) + continue; return x; } @@ -683,7 +701,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, unsigned int h = xfrm_src_hash(net, daddr, saddr, family); struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { + hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -692,7 +710,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, if ((mark & x->mark.m) != x->mark.v) continue; - xfrm_state_hold(x); + if (!xfrm_state_hold_rcu(x)) + continue; return x; } @@ -775,13 +794,16 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; + unsigned int sequence; struct km_event c; to_put = NULL; - spin_lock_bh(&net->xfrm.xfrm_state_lock); + sequence = read_seqcount_begin(&xfrm_state_hash_generation); + + rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -797,7 +819,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, goto found; h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) { + hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -850,19 +872,21 @@ found: } if (km_query(x, tmpl, pol) == 0) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, daddr, saddr, encap_family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -871,13 +895,26 @@ found: } } out: - if (x) - xfrm_state_hold(x); - else + if (x) { + if (!xfrm_state_hold_rcu(x)) { + *err = -EAGAIN; + x = NULL; + } + } else { *err = acquire_in_progress ? -EAGAIN : error; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); + + if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) { + *err = -EAGAIN; + if (x) { + xfrm_state_put(x); + x = NULL; + } + } + return x; } @@ -945,16 +982,16 @@ static void __xfrm_state_insert(struct xfrm_state *x) h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); @@ -1063,9 +1100,9 @@ static struct xfrm_state *__find_acq_core(struct net *net, xfrm_state_hold(x); tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); h = xfrm_src_hash(net, daddr, saddr, family); - hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); net->xfrm.state_num++; @@ -1581,7 +1618,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) if (x->id.spi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; @@ -2099,8 +2136,6 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); - INIT_HLIST_HEAD(&net->xfrm.state_gc_list); - INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); spin_lock_init(&net->xfrm.xfrm_state_lock); return 0; @@ -2118,7 +2153,7 @@ void xfrm_state_fini(struct net *net) flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, IPSEC_PROTO_ANY, false); - flush_work(&net->xfrm.state_gc_work); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 78c6f131d94f..1f6cc9b6a23b 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -1529,6 +1529,108 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { + "direct packet access: test5 (pkt_end >= reg, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test6 (pkt_end >= reg, bad access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid access to packet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test7 (pkt_end >= reg, both accesses)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid access to packet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test8 (double test, variant 1)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 4), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test9 (double test, variant 2)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { "helper access to packet: test1, valid packet_ptr range", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, |