diff options
-rw-r--r-- | Documentation/networking/mpls-sysctl.txt | 20 | ||||
-rw-r--r-- | drivers/net/ethernet/cadence/Kconfig | 4 | ||||
-rw-r--r-- | drivers/net/ethernet/emulex/benet/be.h | 9 | ||||
-rw-r--r-- | drivers/net/ethernet/emulex/benet/be_cmds.c | 114 | ||||
-rw-r--r-- | drivers/net/ethernet/emulex/benet/be_cmds.h | 23 | ||||
-rw-r--r-- | drivers/net/ethernet/emulex/benet/be_ethtool.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/emulex/benet/be_main.c | 260 | ||||
-rw-r--r-- | drivers/net/ethernet/renesas/sh_eth.c | 259 | ||||
-rw-r--r-- | drivers/net/ethernet/renesas/sh_eth.h | 23 | ||||
-rw-r--r-- | include/linux/socket.h | 2 | ||||
-rw-r--r-- | include/net/arp.h | 19 | ||||
-rw-r--r-- | include/net/ndisc.h | 19 | ||||
-rw-r--r-- | include/net/neighbour.h | 55 | ||||
-rw-r--r-- | include/net/net_namespace.h | 4 | ||||
-rw-r--r-- | include/net/netns/mpls.h | 17 | ||||
-rw-r--r-- | include/uapi/linux/rtnetlink.h | 10 | ||||
-rw-r--r-- | net/Makefile | 2 | ||||
-rw-r--r-- | net/core/neighbour.c | 54 | ||||
-rw-r--r-- | net/decnet/dn_neigh.c | 6 | ||||
-rw-r--r-- | net/ipv4/arp.c | 9 | ||||
-rw-r--r-- | net/ipv6/ndisc.c | 7 | ||||
-rw-r--r-- | net/mpls/Kconfig | 23 | ||||
-rw-r--r-- | net/mpls/Makefile | 1 | ||||
-rw-r--r-- | net/mpls/af_mpls.c | 974 | ||||
-rw-r--r-- | net/mpls/internal.h | 59 |
25 files changed, 1773 insertions, 202 deletions
diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt new file mode 100644 index 000000000000..639ddf0ece9b --- /dev/null +++ b/Documentation/networking/mpls-sysctl.txt @@ -0,0 +1,20 @@ +/proc/sys/net/mpls/* Variables: + +platform_labels - INTEGER + Number of entries in the platform label table. It is not + possible to configure forwarding for label values equal to or + greater than the number of platform labels. + + A dense utilization of the entries in the platform label table + is possible and expected as the platform labels are locally + allocated. + + If the number of platform label table entries is set to 0 no + label will be recognized by the kernel and mpls forwarding + will be disabled. + + Reducing this value will remove all label routing entries that + no longer fit in the table. + + Possible values: 0 - 1048575 + Default: 0 diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig index 321d2ad235d9..739bb0048ebf 100644 --- a/drivers/net/ethernet/cadence/Kconfig +++ b/drivers/net/ethernet/cadence/Kconfig @@ -4,7 +4,7 @@ config NET_CADENCE bool "Cadence devices" - depends on HAS_IOMEM && (ARM || AVR32 || MICROBLAZE || COMPILE_TEST) + depends on HAS_IOMEM default y ---help--- If you have a network (Ethernet) card belonging to this class, say Y. 
@@ -30,7 +30,7 @@ config ARM_AT91_ETHER config MACB tristate "Cadence MACB/GEM support" - depends on HAS_DMA && (PLATFORM_AT32AP || ARCH_AT91 || ARCH_PICOXCELL || ARCH_ZYNQ || MICROBLAZE || COMPILE_TEST) + depends on HAS_DMA select PHYLIB ---help--- The Cadence MACB ethernet interface is found on many Atmel AT32 and diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index fac806a15a61..996bbc6a244f 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -87,6 +87,7 @@ #define BE3_MAX_EVT_QS 16 #define BE3_SRIOV_MAX_EVT_QS 8 +#define MAX_RSS_IFACES 15 #define MAX_RX_QS 32 #define MAX_EVT_QS 32 #define MAX_TX_QS 32 @@ -411,8 +412,11 @@ struct be_resources { u16 max_tx_qs; u16 max_rss_qs; u16 max_rx_qs; + u16 max_cq_count; u16 max_uc_mac; /* Max UC MACs programmable */ u16 max_vlans; /* Number of vlans supported */ + u16 max_iface_count; + u16 max_mcc_count; u16 max_evt_qs; u32 if_cap_flags; u32 vf_if_cap_flags; /* VF if capability flags */ @@ -488,6 +492,8 @@ struct be_adapter { /* Rx rings */ u16 num_rx_qs; + u16 num_rss_qs; + u16 need_def_rxq; struct be_rx_obj rx_obj[MAX_RX_QS]; u32 big_page_size; /* Compounded page size shared by rx wrbs */ @@ -635,9 +641,8 @@ extern const struct ethtool_ops be_ethtool_ops; for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs; \ i++, rxo++) -/* Skip the default non-rss queue (last one)*/ #define for_all_rss_queues(adapter, rxo, i) \ - for (i = 0, rxo = &adapter->rx_obj[i]; i < (adapter->num_rx_qs - 1);\ + for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rss_qs; \ i++, rxo++) #define for_all_tx_queues(adapter, txo, i) \ diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index be00695b3be7..75cb4610423b 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -3577,12 +3577,12 @@ static void be_copy_nic_desc(struct be_resources 
*res, res->max_rss_qs = le16_to_cpu(desc->rssq_count); res->max_rx_qs = le16_to_cpu(desc->rq_count); res->max_evt_qs = le16_to_cpu(desc->eq_count); + res->max_cq_count = le16_to_cpu(desc->cq_count); + res->max_iface_count = le16_to_cpu(desc->iface_count); + res->max_mcc_count = le16_to_cpu(desc->mcc_count); /* Clear flags that driver is not interested in */ res->if_cap_flags = le32_to_cpu(desc->cap_flags) & BE_IF_CAP_FLAGS_WANT; - /* Need 1 RXQ as the default RXQ */ - if (res->max_rss_qs && res->max_rss_qs == res->max_rx_qs) - res->max_rss_qs -= 1; } /* Uses Mbox */ @@ -3644,7 +3644,7 @@ err: /* Will use MBOX only if MCCQ has not been created */ int be_cmd_get_profile_config(struct be_adapter *adapter, - struct be_resources *res, u8 domain) + struct be_resources *res, u8 query, u8 domain) { struct be_cmd_resp_get_profile_config *resp; struct be_cmd_req_get_profile_config *req; @@ -3654,7 +3654,7 @@ int be_cmd_get_profile_config(struct be_adapter *adapter, struct be_nic_res_desc *nic; struct be_mcc_wrb wrb = {0}; struct be_dma_mem cmd; - u32 desc_count; + u16 desc_count; int status; memset(&cmd, 0, sizeof(struct be_dma_mem)); @@ -3673,12 +3673,19 @@ int be_cmd_get_profile_config(struct be_adapter *adapter, req->hdr.version = 1; req->type = ACTIVE_PROFILE_TYPE; + /* When QUERY_MODIFIABLE_FIELDS_TYPE bit is set, cmd returns the + * descriptors with all bits set to "1" for the fields which can be + * modified using SET_PROFILE_CONFIG cmd. 
+ */ + if (query == RESOURCE_MODIFIABLE) + req->type |= QUERY_MODIFIABLE_FIELDS_TYPE; + status = be_cmd_notify_wait(adapter, &wrb); if (status) goto err; resp = cmd.va; - desc_count = le32_to_cpu(resp->desc_count); + desc_count = le16_to_cpu(resp->desc_count); pcie = be_get_pcie_desc(adapter->pdev->devfn, resp->func_param, desc_count); @@ -3803,23 +3810,80 @@ int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, u16 link_speed, 1, version, domain); } +static void be_fill_vf_res_template(struct be_adapter *adapter, + struct be_resources pool_res, + u16 num_vfs, u16 num_vf_qs, + struct be_nic_res_desc *nic_vft) +{ + u32 vf_if_cap_flags = pool_res.vf_if_cap_flags; + struct be_resources res_mod = {0}; + + /* Resource with fields set to all '1's by GET_PROFILE_CONFIG cmd, + * which are modifiable using SET_PROFILE_CONFIG cmd. + */ + be_cmd_get_profile_config(adapter, &res_mod, RESOURCE_MODIFIABLE, 0); + + /* If RSS IFACE capability flags are modifiable for a VF, set the + * capability flag as valid and set RSS and DEFQ_RSS IFACE flags if + * more than 1 RSSQ is available for a VF. + * Otherwise, provision only 1 queue pair for VF. 
+ */ + if (res_mod.vf_if_cap_flags & BE_IF_FLAGS_RSS) { + nic_vft->flags |= BIT(IF_CAPS_FLAGS_VALID_SHIFT); + if (num_vf_qs > 1) { + vf_if_cap_flags |= BE_IF_FLAGS_RSS; + if (pool_res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS) + vf_if_cap_flags |= BE_IF_FLAGS_DEFQ_RSS; + } else { + vf_if_cap_flags &= ~(BE_IF_FLAGS_RSS | + BE_IF_FLAGS_DEFQ_RSS); + } + + nic_vft->cap_flags = cpu_to_le32(vf_if_cap_flags); + } else { + num_vf_qs = 1; + } + + nic_vft->rq_count = cpu_to_le16(num_vf_qs); + nic_vft->txq_count = cpu_to_le16(num_vf_qs); + nic_vft->rssq_count = cpu_to_le16(num_vf_qs); + nic_vft->cq_count = cpu_to_le16(pool_res.max_cq_count / + (num_vfs + 1)); + + /* Distribute unicast MACs, VLANs, IFACE count and MCCQ count equally + * among the PF and it's VFs, if the fields are changeable + */ + if (res_mod.max_uc_mac == FIELD_MODIFIABLE) + nic_vft->unicast_mac_count = cpu_to_le16(pool_res.max_uc_mac / + (num_vfs + 1)); + + if (res_mod.max_vlans == FIELD_MODIFIABLE) + nic_vft->vlan_count = cpu_to_le16(pool_res.max_vlans / + (num_vfs + 1)); + + if (res_mod.max_iface_count == FIELD_MODIFIABLE) + nic_vft->iface_count = cpu_to_le16(pool_res.max_iface_count / + (num_vfs + 1)); + + if (res_mod.max_mcc_count == FIELD_MODIFIABLE) + nic_vft->mcc_count = cpu_to_le16(pool_res.max_mcc_count / + (num_vfs + 1)); +} + int be_cmd_set_sriov_config(struct be_adapter *adapter, - struct be_resources res, u16 num_vfs) + struct be_resources pool_res, u16 num_vfs, + u16 num_vf_qs) { struct { struct be_pcie_res_desc pcie; struct be_nic_res_desc nic_vft; } __packed desc; - u16 vf_q_count; - - if (BEx_chip(adapter) || lancer_chip(adapter)) - return 0; /* PF PCIE descriptor */ be_reset_pcie_desc(&desc.pcie); desc.pcie.hdr.desc_type = PCIE_RESOURCE_DESC_TYPE_V1; desc.pcie.hdr.desc_len = RESOURCE_DESC_SIZE_V1; - desc.pcie.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + desc.pcie.flags = BIT(IMM_SHIFT) | BIT(NOSV_SHIFT); desc.pcie.pf_num = adapter->pdev->devfn; desc.pcie.sriov_state = num_vfs ? 
1 : 0; desc.pcie.num_vfs = cpu_to_le16(num_vfs); @@ -3828,32 +3892,12 @@ int be_cmd_set_sriov_config(struct be_adapter *adapter, be_reset_nic_desc(&desc.nic_vft); desc.nic_vft.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1; desc.nic_vft.hdr.desc_len = RESOURCE_DESC_SIZE_V1; - desc.nic_vft.flags = (1 << VFT_SHIFT) | (1 << IMM_SHIFT) | - (1 << NOSV_SHIFT); + desc.nic_vft.flags = BIT(VFT_SHIFT) | BIT(IMM_SHIFT) | BIT(NOSV_SHIFT); desc.nic_vft.pf_num = adapter->pdev->devfn; desc.nic_vft.vf_num = 0; - if (num_vfs && res.vf_if_cap_flags & BE_IF_FLAGS_RSS) { - /* If number of VFs requested is 8 less than max supported, - * assign 8 queue pairs to the PF and divide the remaining - * resources evenly among the VFs - */ - if (num_vfs < (be_max_vfs(adapter) - 8)) - vf_q_count = (res.max_rss_qs - 8) / num_vfs; - else - vf_q_count = res.max_rss_qs / num_vfs; - - desc.nic_vft.rq_count = cpu_to_le16(vf_q_count); - desc.nic_vft.txq_count = cpu_to_le16(vf_q_count); - desc.nic_vft.rssq_count = cpu_to_le16(vf_q_count - 1); - desc.nic_vft.cq_count = cpu_to_le16(3 * vf_q_count); - } else { - desc.nic_vft.txq_count = cpu_to_le16(1); - desc.nic_vft.rq_count = cpu_to_le16(1); - desc.nic_vft.rssq_count = cpu_to_le16(0); - /* One CQ for each TX, RX and MCCQ */ - desc.nic_vft.cq_count = cpu_to_le16(3); - } + be_fill_vf_res_template(adapter, pool_res, num_vfs, num_vf_qs, + &desc.nic_vft); return be_cmd_set_profile_config(adapter, &desc, 2 * RESOURCE_DESC_SIZE_V1, 2, 1, 0); diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index db761e8e42a3..53e903f37247 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -588,14 +588,15 @@ enum be_if_flags { BE_IF_FLAGS_MCAST_PROMISCUOUS = 0x200, BE_IF_FLAGS_PASS_L2_ERRORS = 0x400, BE_IF_FLAGS_PASS_L3L4_ERRORS = 0x800, - BE_IF_FLAGS_MULTICAST = 0x1000 + BE_IF_FLAGS_MULTICAST = 0x1000, + BE_IF_FLAGS_DEFQ_RSS = 0x1000000 }; #define BE_IF_CAP_FLAGS_WANT 
(BE_IF_FLAGS_RSS | BE_IF_FLAGS_PROMISCUOUS |\ BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_VLAN_PROMISCUOUS |\ BE_IF_FLAGS_VLAN | BE_IF_FLAGS_MCAST_PROMISCUOUS |\ BE_IF_FLAGS_PASS_L3L4_ERRORS | BE_IF_FLAGS_MULTICAST |\ - BE_IF_FLAGS_UNTAGGED) + BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_DEFQ_RSS) #define BE_IF_FLAGS_ALL_PROMISCUOUS (BE_IF_FLAGS_PROMISCUOUS | \ BE_IF_FLAGS_VLAN_PROMISCUOUS |\ @@ -2021,6 +2022,7 @@ struct be_cmd_req_set_ext_fat_caps { #define PORT_RESOURCE_DESC_TYPE_V1 0x55 #define MAX_RESOURCE_DESC 264 +#define IF_CAPS_FLAGS_VALID_SHIFT 0 /* IF caps valid */ #define VFT_SHIFT 3 /* VF template */ #define IMM_SHIFT 6 /* Immediate */ #define NOSV_SHIFT 7 /* No save */ @@ -2131,20 +2133,28 @@ struct be_cmd_resp_get_func_config { u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; }; -#define ACTIVE_PROFILE_TYPE 0x2 +enum { + RESOURCE_LIMITS, + RESOURCE_MODIFIABLE +}; + struct be_cmd_req_get_profile_config { struct be_cmd_req_hdr hdr; u8 rsvd; +#define ACTIVE_PROFILE_TYPE 0x2 +#define QUERY_MODIFIABLE_FIELDS_TYPE BIT(3) u8 type; u16 rsvd1; }; struct be_cmd_resp_get_profile_config { struct be_cmd_resp_hdr hdr; - u32 desc_count; + __le16 desc_count; + u16 rsvd; u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; }; +#define FIELD_MODIFIABLE 0xFFFF struct be_cmd_req_set_profile_config { struct be_cmd_req_hdr hdr; u32 rsvd; @@ -2344,7 +2354,7 @@ int be_cmd_query_port_name(struct be_adapter *adapter); int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res); int be_cmd_get_profile_config(struct be_adapter *adapter, - struct be_resources *res, u8 domain); + struct be_resources *res, u8 query, u8 domain); int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile); int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, int vf_num); @@ -2355,4 +2365,5 @@ int be_cmd_set_logical_link_config(struct be_adapter *adapter, int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port); int 
be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op); int be_cmd_set_sriov_config(struct be_adapter *adapter, - struct be_resources res, u16 num_vfs); + struct be_resources res, u16 num_vfs, + u16 num_vf_qs); diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 4d2de4700769..b765c24625bf 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1097,7 +1097,7 @@ static int be_set_rss_hash_opts(struct be_adapter *adapter, return status; if (be_multi_rxq(adapter)) { - for (j = 0; j < 128; j += adapter->num_rx_qs - 1) { + for (j = 0; j < 128; j += adapter->num_rss_qs) { for_all_rss_queues(adapter, rxo, i) { if ((j + i) >= 128) break; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7eccebc676e2..5652b005947f 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -30,6 +30,9 @@ MODULE_DESCRIPTION(DRV_DESC " " DRV_VER); MODULE_AUTHOR("Emulex Corporation"); MODULE_LICENSE("GPL"); +/* num_vfs module param is obsolete. + * Use sysfs method to enable/disable VFs. + */ static unsigned int num_vfs; module_param(num_vfs, uint, S_IRUGO); MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize"); @@ -2454,13 +2457,19 @@ static int be_rx_cqs_create(struct be_adapter *adapter) int rc, i; /* We can create as many RSS rings as there are EQs. */ - adapter->num_rx_qs = adapter->num_evt_qs; + adapter->num_rss_qs = adapter->num_evt_qs; + + /* We'll use RSS only if atleast 2 RSS rings are supported. */ + if (adapter->num_rss_qs <= 1) + adapter->num_rss_qs = 0; - /* We'll use RSS only if atleast 2 RSS rings are supported. - * When RSS is used, we'll need a default RXQ for non-IP traffic. 
+ adapter->num_rx_qs = adapter->num_rss_qs + adapter->need_def_rxq; + + /* When the interface is not capable of RSS rings (and there is no + * need to create a default RXQ) we'll still need one RXQ */ - if (adapter->num_rx_qs > 1) - adapter->num_rx_qs++; + if (adapter->num_rx_qs == 0) + adapter->num_rx_qs = 1; adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE; for_all_rx_queues(adapter, rxo, i) { @@ -2479,8 +2488,7 @@ static int be_rx_cqs_create(struct be_adapter *adapter) } dev_info(&adapter->pdev->dev, - "created %d RSS queue(s) and 1 default RX queue\n", - adapter->num_rx_qs - 1); + "created %d RX queue(s)\n", adapter->num_rx_qs); return 0; } @@ -3110,12 +3118,14 @@ static int be_rx_qs_create(struct be_adapter *adapter) return rc; } - /* The FW would like the default RXQ to be created first */ - rxo = default_rxo(adapter); - rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size, - adapter->if_handle, false, &rxo->rss_id); - if (rc) - return rc; + if (adapter->need_def_rxq || !adapter->num_rss_qs) { + rxo = default_rxo(adapter); + rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, + rx_frag_size, adapter->if_handle, + false, &rxo->rss_id); + if (rc) + return rc; + } for_all_rss_queues(adapter, rxo, i) { rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, @@ -3126,8 +3136,7 @@ static int be_rx_qs_create(struct be_adapter *adapter) } if (be_multi_rxq(adapter)) { - for (j = 0; j < RSS_INDIR_TABLE_LEN; - j += adapter->num_rx_qs - 1) { + for (j = 0; j < RSS_INDIR_TABLE_LEN; j += adapter->num_rss_qs) { for_all_rss_queues(adapter, rxo, i) { if ((j + i) >= RSS_INDIR_TABLE_LEN) break; @@ -3402,8 +3411,39 @@ static void be_disable_vxlan_offloads(struct be_adapter *adapter) } #endif +static u16 be_calculate_vf_qs(struct be_adapter *adapter, u16 num_vfs) +{ + struct be_resources res = adapter->pool_res; + u16 num_vf_qs = 1; + + /* Distribute the queue resources equally among the PF and it's VFs + * Do not distribute queue resources in 
multi-channel configuration. + */ + if (num_vfs && !be_is_mc(adapter)) { + /* If number of VFs requested is 8 less than max supported, + * assign 8 queue pairs to the PF and divide the remaining + * resources evenly among the VFs + */ + if (num_vfs < (be_max_vfs(adapter) - 8)) + num_vf_qs = (res.max_rss_qs - 8) / num_vfs; + else + num_vf_qs = res.max_rss_qs / num_vfs; + + /* Skyhawk-R chip supports only MAX_RSS_IFACES RSS capable + * interfaces per port. Provide RSS on VFs, only if number + * of VFs requested is less than MAX_RSS_IFACES limit. + */ + if (num_vfs >= MAX_RSS_IFACES) + num_vf_qs = 1; + } + return num_vf_qs; +} + static int be_clear(struct be_adapter *adapter) { + struct pci_dev *pdev = adapter->pdev; + u16 num_vf_qs; + be_cancel_worker(adapter); if (sriov_enabled(adapter)) @@ -3412,9 +3452,14 @@ static int be_clear(struct be_adapter *adapter) /* Re-configure FW to distribute resources evenly across max-supported * number of VFs, only when VFs are not already enabled. */ - if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev)) + if (skyhawk_chip(adapter) && be_physfn(adapter) && + !pci_vfs_assigned(pdev)) { + num_vf_qs = be_calculate_vf_qs(adapter, + pci_sriov_get_totalvfs(pdev)); be_cmd_set_sriov_config(adapter, adapter->pool_res, - pci_sriov_get_totalvfs(adapter->pdev)); + pci_sriov_get_totalvfs(pdev), + num_vf_qs); + } #ifdef CONFIG_BE2NET_VXLAN be_disable_vxlan_offloads(adapter); @@ -3439,7 +3484,7 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle, en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS | - BE_IF_FLAGS_RSS; + BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS; en_flags &= cap_flags; @@ -3463,6 +3508,7 @@ static int be_vfs_if_create(struct be_adapter *adapter) for_all_vfs(adapter, vf_cfg, vf) { if (!BE3_chip(adapter)) { status = be_cmd_get_profile_config(adapter, &res, + RESOURCE_LIMITS, vf + 1); if (!status) cap_flags = res.if_cap_flags; @@ -3629,7 +3675,8 @@ 
static void BEx_get_resources(struct be_adapter *adapter, /* On a SuperNIC profile, the driver needs to use the * GET_PROFILE_CONFIG cmd to query the per-function TXQ limits */ - be_cmd_get_profile_config(adapter, &super_nic_res, 0); + be_cmd_get_profile_config(adapter, &super_nic_res, + RESOURCE_LIMITS, 0); /* Some old versions of BE3 FW don't report max_tx_qs value */ res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS; } else { @@ -3649,6 +3696,7 @@ static void BEx_get_resources(struct be_adapter *adapter, res->max_evt_qs = 1; res->if_cap_flags = BE_IF_CAP_FLAGS_WANT; + res->if_cap_flags &= ~BE_IF_FLAGS_DEFQ_RSS; if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS)) res->if_cap_flags &= ~BE_IF_FLAGS_RSS; } @@ -3668,13 +3716,12 @@ static void be_setup_init(struct be_adapter *adapter) static int be_get_sriov_config(struct be_adapter *adapter) { - struct device *dev = &adapter->pdev->dev; struct be_resources res = {0}; int max_vfs, old_vfs; - /* Some old versions of BE3 FW don't report max_vfs value */ - be_cmd_get_profile_config(adapter, &res, 0); + be_cmd_get_profile_config(adapter, &res, RESOURCE_LIMITS, 0); + /* Some old versions of BE3 FW don't report max_vfs value */ if (BE3_chip(adapter) && !res.max_vfs) { max_vfs = pci_sriov_get_totalvfs(adapter->pdev); res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0; @@ -3682,35 +3729,49 @@ static int be_get_sriov_config(struct be_adapter *adapter) adapter->pool_res = res; - if (!be_max_vfs(adapter)) { - if (num_vfs) - dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n"); - adapter->num_vfs = 0; - return 0; - } - - pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter)); - - /* validate num_vfs module param */ + /* If during previous unload of the driver, the VFs were not disabled, + * then we cannot rely on the PF POOL limits for the TotalVFs value. + * Instead use the TotalVFs value stored in the pci-dev struct. 
+ */ old_vfs = pci_num_vf(adapter->pdev); if (old_vfs) { - dev_info(dev, "%d VFs are already enabled\n", old_vfs); - if (old_vfs != num_vfs) - dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs); + dev_info(&adapter->pdev->dev, "%d VFs are already enabled\n", + old_vfs); + + adapter->pool_res.max_vfs = + pci_sriov_get_totalvfs(adapter->pdev); adapter->num_vfs = old_vfs; - } else { - if (num_vfs > be_max_vfs(adapter)) { - dev_info(dev, "Resources unavailable to init %d VFs\n", - num_vfs); - dev_info(dev, "Limiting to %d VFs\n", - be_max_vfs(adapter)); - } - adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter)); } return 0; } +static void be_alloc_sriov_res(struct be_adapter *adapter) +{ + int old_vfs = pci_num_vf(adapter->pdev); + u16 num_vf_qs; + int status; + + be_get_sriov_config(adapter); + + if (!old_vfs) + pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter)); + + /* When the HW is in SRIOV capable configuration, the PF-pool + * resources are given to PF during driver load, if there are no + * old VFs. This facility is not available in BE3 FW. + * Also, this is done by FW in Lancer chip. 
+ */ + if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) { + num_vf_qs = be_calculate_vf_qs(adapter, 0); + status = be_cmd_set_sriov_config(adapter, adapter->pool_res, 0, + num_vf_qs); + if (status) + dev_err(&adapter->pdev->dev, + "Failed to optimize SRIOV resources\n"); + } +} + static int be_get_resources(struct be_adapter *adapter) { struct device *dev = &adapter->pdev->dev; @@ -3731,12 +3792,23 @@ static int be_get_resources(struct be_adapter *adapter) if (status) return status; + /* If a deafault RXQ must be created, we'll use up one RSSQ*/ + if (res.max_rss_qs && res.max_rss_qs == res.max_rx_qs && + !(res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS)) + res.max_rss_qs -= 1; + /* If RoCE may be enabled stash away half the EQs for RoCE */ if (be_roce_supported(adapter)) res.max_evt_qs /= 2; adapter->res = res; } + /* If FW supports RSS default queue, then skip creating non-RSS + * queue for non-IP traffic. + */ + adapter->need_def_rxq = (be_if_cap_flags(adapter) & + BE_IF_FLAGS_DEFQ_RSS) ? 0 : 1; + dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n", be_max_txqs(adapter), be_max_rxqs(adapter), be_max_rss(adapter), be_max_eqs(adapter), @@ -3745,38 +3817,12 @@ static int be_get_resources(struct be_adapter *adapter) be_max_uc(adapter), be_max_mc(adapter), be_max_vlans(adapter)); + /* Sanitize cfg_num_qs based on HW and platform limits */ + adapter->cfg_num_qs = min_t(u16, netif_get_num_default_rss_queues(), + be_max_qs(adapter)); return 0; } -static void be_sriov_config(struct be_adapter *adapter) -{ - struct device *dev = &adapter->pdev->dev; - int status; - - status = be_get_sriov_config(adapter); - if (status) { - dev_err(dev, "Failed to query SR-IOV configuration\n"); - dev_err(dev, "SR-IOV cannot be enabled\n"); - return; - } - - /* When the HW is in SRIOV capable configuration, the PF-pool - * resources are equally distributed across the max-number of - * VFs. The user may request only a subset of the max-vfs to be - * enabled. 
Based on num_vfs, redistribute the resources across - * num_vfs so that each VF will have access to more number of - * resources. This facility is not available in BE3 FW. - * Also, this is done by FW in Lancer chip. - */ - if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) { - status = be_cmd_set_sriov_config(adapter, - adapter->pool_res, - adapter->num_vfs); - if (status) - dev_err(dev, "Failed to optimize SR-IOV resources\n"); - } -} - static int be_get_config(struct be_adapter *adapter) { int status, level; @@ -3807,9 +3853,6 @@ static int be_get_config(struct be_adapter *adapter) "Using profile 0x%x\n", profile_id); } - if (!BE2_chip(adapter) && be_physfn(adapter)) - be_sriov_config(adapter); - status = be_get_resources(adapter); if (status) return status; @@ -3819,9 +3862,6 @@ static int be_get_config(struct be_adapter *adapter) if (!adapter->pmac_id) return -ENOMEM; - /* Sanitize cfg_num_qs based on HW and platform limits */ - adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter)); - return 0; } @@ -3996,6 +4036,9 @@ static int be_setup(struct be_adapter *adapter) if (!lancer_chip(adapter)) be_cmd_req_native_mode(adapter); + if (!BE2_chip(adapter) && be_physfn(adapter)) + be_alloc_sriov_res(adapter); + status = be_get_config(adapter); if (status) goto err; @@ -5217,7 +5260,6 @@ static int be_drv_init(struct be_adapter *adapter) /* Must be a power of 2 or else MODULO will BUG_ON */ adapter->be_get_temp_freq = 64; - adapter->cfg_num_qs = netif_get_num_default_rss_queues(); return 0; @@ -5541,6 +5583,60 @@ err: dev_err(&adapter->pdev->dev, "EEH resume failed\n"); } +static int be_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + u16 num_vf_qs; + int status; + + if (!num_vfs) + be_vf_clear(adapter); + + adapter->num_vfs = num_vfs; + + if (adapter->num_vfs == 0 && pci_vfs_assigned(pdev)) { + dev_warn(&pdev->dev, + "Cannot disable VFs while they are assigned\n"); + return -EBUSY; + } + 
+ /* When the HW is in SRIOV capable configuration, the PF-pool resources + * are equally distributed across the max-number of VFs. The user may + * request only a subset of the max-vfs to be enabled. + * Based on num_vfs, redistribute the resources across num_vfs so that + * each VF will have access to more number of resources. + * This facility is not available in BE3 FW. + * Also, this is done by FW in Lancer chip. + */ + if (skyhawk_chip(adapter) && !pci_num_vf(pdev)) { + num_vf_qs = be_calculate_vf_qs(adapter, adapter->num_vfs); + status = be_cmd_set_sriov_config(adapter, adapter->pool_res, + adapter->num_vfs, num_vf_qs); + if (status) + dev_err(&pdev->dev, + "Failed to optimize SR-IOV resources\n"); + } + + status = be_get_resources(adapter); + if (status) + return be_cmd_status(status); + + /* Updating real_num_tx/rx_queues() requires rtnl_lock() */ + rtnl_lock(); + status = be_update_queues(adapter); + rtnl_unlock(); + if (status) + return be_cmd_status(status); + + if (adapter->num_vfs) + status = be_vf_setup(adapter); + + if (!status) + return adapter->num_vfs; + + return 0; +} + static const struct pci_error_handlers be_eeh_handlers = { .error_detected = be_eeh_err_detected, .slot_reset = be_eeh_reset, @@ -5555,6 +5651,7 @@ static struct pci_driver be_driver = { .suspend = be_suspend, .resume = be_pci_resume, .shutdown = be_shutdown, + .sriov_configure = be_pci_sriov_configure, .err_handler = &be_eeh_handlers }; @@ -5568,6 +5665,11 @@ static int __init be_init_module(void) rx_frag_size = 2048; } + if (num_vfs > 0) { + pr_info(DRV_NAME " : Module param num_vfs is obsolete."); + pr_info(DRV_NAME " : Use sysfs method to enable VFs\n"); + } + return pci_register_driver(&be_driver); } module_init(be_init_module); diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 736d5d1624a1..7fb244f565b2 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -52,7 +52,12 @@ 
NETIF_MSG_RX_ERR| \ NETIF_MSG_TX_ERR) +#define SH_ETH_OFFSET_DEFAULTS \ + [0 ... SH_ETH_MAX_REGISTER_OFFSET - 1] = SH_ETH_OFFSET_INVALID + static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { + SH_ETH_OFFSET_DEFAULTS, + [EDSR] = 0x0000, [EDMR] = 0x0400, [EDTRR] = 0x0408, @@ -132,9 +137,6 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { [TSU_POST3] = 0x0078, [TSU_POST4] = 0x007c, [TSU_ADRH0] = 0x0100, - [TSU_ADRL0] = 0x0104, - [TSU_ADRH31] = 0x01f8, - [TSU_ADRL31] = 0x01fc, [TXNLCR0] = 0x0080, [TXALCR0] = 0x0084, @@ -151,6 +153,8 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { }; static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = { + SH_ETH_OFFSET_DEFAULTS, + [EDSR] = 0x0000, [EDMR] = 0x0400, [EDTRR] = 0x0408, @@ -199,9 +203,6 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = { [TSU_ADSBSY] = 0x0060, [TSU_TEN] = 0x0064, [TSU_ADRH0] = 0x0100, - [TSU_ADRL0] = 0x0104, - [TSU_ADRH31] = 0x01f8, - [TSU_ADRL31] = 0x01fc, [TXNLCR0] = 0x0080, [TXALCR0] = 0x0084, @@ -210,6 +211,8 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = { }; static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = { + SH_ETH_OFFSET_DEFAULTS, + [ECMR] = 0x0300, [RFLR] = 0x0308, [ECSR] = 0x0310, @@ -256,6 +259,8 @@ static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = { }; static const u16 sh_eth_offset_fast_sh4[SH_ETH_MAX_REGISTER_OFFSET] = { + SH_ETH_OFFSET_DEFAULTS, + [ECMR] = 0x0100, [RFLR] = 0x0108, [ECSR] = 0x0110, @@ -308,6 +313,8 @@ static const u16 sh_eth_offset_fast_sh4[SH_ETH_MAX_REGISTER_OFFSET] = { }; static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = { + SH_ETH_OFFSET_DEFAULTS, + [EDMR] = 0x0000, [EDTRR] = 0x0004, [EDRRR] = 0x0008, @@ -392,8 +399,6 @@ static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = { [FWALCR1] = 0x00b4, [TSU_ADRH0] = 0x0100, - [TSU_ADRL0] = 0x0104, - [TSU_ADRL31] = 
0x01fc, }; static void sh_eth_rcv_snd_disable(struct net_device *ndev); @@ -588,6 +593,7 @@ static struct sh_eth_cpu_data sh7757_data = { .no_ade = 1, .rpadir = 1, .rpadir_value = 2 << 16, + .rtrate = 1, }; #define SH_GIGA_ETH_BASE 0xfee00000UL @@ -1411,6 +1417,9 @@ static int sh_eth_txfree(struct net_device *ndev) break; /* TACT bit must be checked before all the following reads */ rmb(); + netif_info(mdp, tx_done, ndev, + "tx entry %d status 0x%08x\n", + entry, edmac_to_cpu(mdp, txdesc->status)); /* Free the original skb. */ if (mdp->tx_skbuff[entry]) { dma_unmap_single(&ndev->dev, txdesc->addr, @@ -1456,6 +1465,10 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) if (--boguscnt < 0) break; + netif_info(mdp, rx_status, ndev, + "rx entry %d status 0x%08x len %d\n", + entry, desc_status, pkt_len); + if (!(desc_status & RDFEND)) ndev->stats.rx_length_errors++; @@ -1500,6 +1513,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) netif_receive_skb(skb); ndev->stats.rx_packets++; ndev->stats.rx_bytes += pkt_len; + if (desc_status & RD_RFS8) + ndev->stats.multicast++; } entry = (++mdp->cur_rx) % mdp->num_rx_ring; rxdesc = &mdp->rx_ring[entry]; @@ -1542,7 +1557,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) /* If we don't need to check status, don't. -KDU */ if (!(sh_eth_read(ndev, EDRRR) & EDRRR_R)) { /* fix the values for the next receiving if RDE is set */ - if (intr_status & EESR_RDE && mdp->reg_offset[RDFAR] != 0) { + if (intr_status & EESR_RDE && + mdp->reg_offset[RDFAR] != SH_ETH_OFFSET_INVALID) { u32 count = (sh_eth_read(ndev, RDFAR) - sh_eth_read(ndev, RDLAR)) >> 4; @@ -1929,6 +1945,192 @@ error_exit: return ret; } +/* If it is ever necessary to increase SH_ETH_REG_DUMP_MAX_REGS, the + * version must be bumped as well. Just adding registers up to that + * limit is fine, as long as the existing register indices don't + * change. 
+ */ +#define SH_ETH_REG_DUMP_VERSION 1 +#define SH_ETH_REG_DUMP_MAX_REGS 256 + +static size_t __sh_eth_get_regs(struct net_device *ndev, u32 *buf) +{ + struct sh_eth_private *mdp = netdev_priv(ndev); + struct sh_eth_cpu_data *cd = mdp->cd; + u32 *valid_map; + size_t len; + + BUILD_BUG_ON(SH_ETH_MAX_REGISTER_OFFSET > SH_ETH_REG_DUMP_MAX_REGS); + + /* Dump starts with a bitmap that tells ethtool which + * registers are defined for this chip. + */ + len = DIV_ROUND_UP(SH_ETH_REG_DUMP_MAX_REGS, 32); + if (buf) { + valid_map = buf; + buf += len; + } else { + valid_map = NULL; + } + + /* Add a register to the dump, if it has a defined offset. + * This automatically skips most undefined registers, but for + * some it is also necessary to check a capability flag in + * struct sh_eth_cpu_data. + */ +#define mark_reg_valid(reg) valid_map[reg / 32] |= 1U << (reg % 32) +#define add_reg_from(reg, read_expr) do { \ + if (mdp->reg_offset[reg] != SH_ETH_OFFSET_INVALID) { \ + if (buf) { \ + mark_reg_valid(reg); \ + *buf++ = read_expr; \ + } \ + ++len; \ + } \ + } while (0) +#define add_reg(reg) add_reg_from(reg, sh_eth_read(ndev, reg)) +#define add_tsu_reg(reg) add_reg_from(reg, sh_eth_tsu_read(mdp, reg)) + + add_reg(EDSR); + add_reg(EDMR); + add_reg(EDTRR); + add_reg(EDRRR); + add_reg(EESR); + add_reg(EESIPR); + add_reg(TDLAR); + add_reg(TDFAR); + add_reg(TDFXR); + add_reg(TDFFR); + add_reg(RDLAR); + add_reg(RDFAR); + add_reg(RDFXR); + add_reg(RDFFR); + add_reg(TRSCER); + add_reg(RMFCR); + add_reg(TFTR); + add_reg(FDR); + add_reg(RMCR); + add_reg(TFUCR); + add_reg(RFOCR); + if (cd->rmiimode) + add_reg(RMIIMODE); + add_reg(FCFTR); + if (cd->rpadir) + add_reg(RPADIR); + if (!cd->no_trimd) + add_reg(TRIMD); + add_reg(ECMR); + add_reg(ECSR); + add_reg(ECSIPR); + add_reg(PIR); + if (!cd->no_psr) + add_reg(PSR); + add_reg(RDMLR); + add_reg(RFLR); + add_reg(IPGR); + if (cd->apr) + add_reg(APR); + if (cd->mpr) + add_reg(MPR); + add_reg(RFCR); + add_reg(RFCF); + if (cd->tpauser) + 
add_reg(TPAUSER); + add_reg(TPAUSECR); + add_reg(GECMR); + if (cd->bculr) + add_reg(BCULR); + add_reg(MAHR); + add_reg(MALR); + add_reg(TROCR); + add_reg(CDCR); + add_reg(LCCR); + add_reg(CNDCR); + add_reg(CEFCR); + add_reg(FRECR); + add_reg(TSFRCR); + add_reg(TLFRCR); + add_reg(CERCR); + add_reg(CEECR); + add_reg(MAFCR); + if (cd->rtrate) + add_reg(RTRATE); + if (cd->hw_crc) + add_reg(CSMR); + if (cd->select_mii) + add_reg(RMII_MII); + add_reg(ARSTR); + if (cd->tsu) { + add_tsu_reg(TSU_CTRST); + add_tsu_reg(TSU_FWEN0); + add_tsu_reg(TSU_FWEN1); + add_tsu_reg(TSU_FCM); + add_tsu_reg(TSU_BSYSL0); + add_tsu_reg(TSU_BSYSL1); + add_tsu_reg(TSU_PRISL0); + add_tsu_reg(TSU_PRISL1); + add_tsu_reg(TSU_FWSL0); + add_tsu_reg(TSU_FWSL1); + add_tsu_reg(TSU_FWSLC); + add_tsu_reg(TSU_QTAG0); + add_tsu_reg(TSU_QTAG1); + add_tsu_reg(TSU_QTAGM0); + add_tsu_reg(TSU_QTAGM1); + add_tsu_reg(TSU_FWSR); + add_tsu_reg(TSU_FWINMK); + add_tsu_reg(TSU_ADQT0); + add_tsu_reg(TSU_ADQT1); + add_tsu_reg(TSU_VTAG0); + add_tsu_reg(TSU_VTAG1); + add_tsu_reg(TSU_ADSBSY); + add_tsu_reg(TSU_TEN); + add_tsu_reg(TSU_POST1); + add_tsu_reg(TSU_POST2); + add_tsu_reg(TSU_POST3); + add_tsu_reg(TSU_POST4); + if (mdp->reg_offset[TSU_ADRH0] != SH_ETH_OFFSET_INVALID) { + /* This is the start of a table, not just a single + * register. 
/* ethtool .get_regs handler.
 *
 * Stamps the dump with SH_ETH_REG_DUMP_VERSION and lets __sh_eth_get_regs()
 * fill @buf.  The byte count written is the value __sh_eth_get_regs()
 * returns; the same helper, called with a NULL buffer, backs
 * sh_eth_get_regs_len().
 */
static void sh_eth_get_regs(struct net_device *ndev, struct ethtool_regs *regs,
			    void *buf)
{
	struct sh_eth_private *mdp = netdev_priv(ndev);

	regs->version = SH_ETH_REG_DUMP_VERSION;

	/* Runtime-PM reference is held across the register reads
	 * (presumably so the block is powered while it is read - confirm).
	 */
	pm_runtime_get_sync(&mdp->pdev->dev);
	__sh_eth_get_regs(ndev, buf);
	pm_runtime_put_sync(&mdp->pdev->dev);
}
+ */ +static void +sh_eth_update_stat(struct net_device *ndev, unsigned long *stat, int reg) +{ + u32 delta = sh_eth_read(ndev, reg); + + if (delta) { + *stat += delta; + sh_eth_write(ndev, 0, reg); + } +} + static struct net_device_stats *sh_eth_get_stats(struct net_device *ndev) { struct sh_eth_private *mdp = netdev_priv(ndev); @@ -2223,21 +2443,18 @@ static struct net_device_stats *sh_eth_get_stats(struct net_device *ndev) if (!mdp->is_opened) return &ndev->stats; - ndev->stats.tx_dropped += sh_eth_read(ndev, TROCR); - sh_eth_write(ndev, 0, TROCR); /* (write clear) */ - ndev->stats.collisions += sh_eth_read(ndev, CDCR); - sh_eth_write(ndev, 0, CDCR); /* (write clear) */ - ndev->stats.tx_carrier_errors += sh_eth_read(ndev, LCCR); - sh_eth_write(ndev, 0, LCCR); /* (write clear) */ + sh_eth_update_stat(ndev, &ndev->stats.tx_dropped, TROCR); + sh_eth_update_stat(ndev, &ndev->stats.collisions, CDCR); + sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors, LCCR); if (sh_eth_is_gether(mdp)) { - ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CERCR); - sh_eth_write(ndev, 0, CERCR); /* (write clear) */ - ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CEECR); - sh_eth_write(ndev, 0, CEECR); /* (write clear) */ + sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors, + CERCR); + sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors, + CEECR); } else { - ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CNDCR); - sh_eth_write(ndev, 0, CNDCR); /* (write clear) */ + sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors, + CNDCR); } return &ndev->stats; diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index 259d03f353e1..06dbbe5201cb 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -32,6 +32,10 @@ #define SH_ETH_TSU_CAM_ENTRIES 32 enum { + /* IMPORTANT: To keep ethtool register dump working, add new + * register names immediately before SH_ETH_MAX_REGISTER_OFFSET. 
/* Read the 32-bit register selected by @enum_index.
 *
 * The per-chip offset is taken from mdp->reg_offset[].  If that entry is
 * SH_ETH_OFFSET_INVALID the access is skipped, a warning is raised through
 * WARN_ON() and ~0U is returned instead of touching the hardware.
 */
static inline u32 sh_eth_read(struct net_device *ndev, int enum_index)
{
	struct sh_eth_private *mdp = netdev_priv(ndev);
	u16 offset = mdp->reg_offset[enum_index];

	/* Register not defined for this chip: warn and return all-ones. */
	if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
		return ~0U;

	return ioread32(mdp->addr + offset);
}
PF_IB AF_IB +#define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH diff --git a/include/net/arp.h b/include/net/arp.h index 21ee1860abbc..5e0f891d476c 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -9,28 +9,17 @@ extern struct neigh_table arp_tbl; -static inline u32 arp_hashfn(u32 key, const struct net_device *dev, u32 hash_rnd) +static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { + u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); - return val * hash_rnd; + return val * hash_rnd[0]; } static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { - struct neigh_hash_table *nht = rcu_dereference_bh(arp_tbl.nht); - struct neighbour *n; - u32 hash_val; - - hash_val = arp_hashfn(key, dev, nht->hash_rnd[0]) >> (32 - nht->hash_shift); - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (n->dev == dev && *(u32 *)n->primary_key == key) - return n; - } - - return NULL; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 6bbda34d5e59..b3a7751251b4 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -156,24 +156,7 @@ static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, _ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey) { - struct neigh_hash_table *nht; - const u32 *p32 = pkey; - struct neighbour *n; - u32 hash_val; - - nht = rcu_dereference_bh(nd_tbl.nht); - hash_val = ndisc_hashfn(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - u32 *n32 = (u32 *) n->primary_key; - if (n->dev == dev && - ((n32[0] 
^ p32[0]) | (n32[1] ^ p32[1]) | - (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0) - return n; - } - - return NULL; + return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9f912e4d4232..afb8237b0a8c 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -197,6 +197,7 @@ struct neigh_table { __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); + bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); @@ -247,6 +248,57 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 + +static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) +{ + return *(const u16 *)n->primary_key == *(const u16 *)pkey; +} + +static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) +{ + return *(const u32 *)n->primary_key == *(const u32 *)pkey; +} + +static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) +{ + const u32 *n32 = (const u32 *)n->primary_key; + const u32 *p32 = pkey; + + return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | + (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; +} + +static inline struct neighbour *___neigh_lookup_noref( + struct neigh_table *tbl, + bool (*key_eq)(const struct neighbour *n, const void *pkey), + __u32 (*hash)(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd), + const void *pkey, + struct net_device *dev) +{ + struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); + struct neighbour *n; + u32 hash_val; + + hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + for (n = 
rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (n->dev == dev && key_eq(n, pkey)) + return n; + } + + return NULL; +} + +static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev) +{ + return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); +} + void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -306,6 +358,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); +int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_entry *)); @@ -459,4 +512,6 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } + + #endif diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 36faf4990c4b..2cb9acb618e9 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -26,6 +26,7 @@ #endif #include <net/netns/nftables.h> #include <net/netns/xfrm.h> +#include <net/netns/mpls.h> #include <linux/ns_common.h> struct user_namespace; @@ -130,6 +131,9 @@ struct net { #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif +#if IS_ENABLED(CONFIG_MPLS) + struct netns_mpls mpls; +#endif struct sock *diag_nlsk; atomic_t fnhe_genid; }; diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h new file mode 100644 index 000000000000..d29203651c01 --- /dev/null +++ b/include/net/netns/mpls.h @@ -0,0 +1,17 @@ +/* + * mpls in net namespaces + */ + +#ifndef __NETNS_MPLS_H__ +#define __NETNS_MPLS_H__ + +struct mpls_route; +struct ctl_table_header; + 
+struct netns_mpls { + size_t platform_labels; + struct mpls_route __rcu * __rcu *platform_label; + struct ctl_table_header *ctl; +}; + +#endif /* __NETNS_MPLS_H__ */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5cc5d66bf519..06f75a407f74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -303,6 +303,8 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, + RTA_VIA, + RTA_NEWDST, __RTA_MAX }; @@ -344,6 +346,12 @@ struct rtnexthop { #define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) #define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) +/* RTA_VIA */ +struct rtvia { + __kernel_sa_family_t rtvia_family; + __u8 rtvia_addr[0]; +}; + /* RTM_CACHEINFO */ struct rta_cacheinfo { @@ -623,6 +631,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF RTNLGRP_MDB, #define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/Makefile b/net/Makefile index 38704bdf941a..3995613e5510 100644 --- a/net/Makefile +++ b/net/Makefile @@ -69,7 +69,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ -obj-$(CONFIG_NET_MPLS_GSO) += mpls/ +obj-$(CONFIG_MPLS) += mpls/ obj-$(CONFIG_HSR) += hsr/ ifneq ($(CONFIG_NET_SWITCHDEV),) obj-y += switchdev/ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 0f48ea3affed..cffaf00561e7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -397,25 +397,15 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; - int key_len = tbl->key_len; - u32 hash_val; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - 
/* Transmit @skb on @dev towards the next hop @addr.
 *
 * @family selects how @addr is interpreted:
 *  - AF_PACKET: @addr is used directly as the destination for
 *    dev_hard_header() and the frame is queued with dev_queue_xmit().
 *  - anything else: the neighbour table registered for @family is searched
 *    for (@addr, @dev); a missing entry is created, and the packet is
 *    passed to that neighbour's output method.
 *
 * Returns the result of the transmit call, or a negative errno.
 */
int neigh_xmit(int family, struct net_device *dev,
	       const void *addr, struct sk_buff *skb)
{
	int err;
	if (family == AF_PACKET) {
		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
				      addr, NULL, skb->len);
		if (err < 0)
			goto out_kfree_skb;
		err = dev_queue_xmit(skb);
	} else {
		struct neigh_table *tbl;
		struct neighbour *neigh;

		err = -ENETDOWN;
		tbl = neigh_find_table(family);
		/* NOTE(review): this path returns -ENETDOWN without freeing
		 * skb, unlike the two error paths that jump to
		 * out_kfree_skb - confirm the caller's ownership expectation.
		 */
		if (!tbl)
			goto out;
		/* Lookup takes no reference on the neighbour entry. */
		neigh = __neigh_lookup_noref(tbl, addr, dev);
		if (!neigh)
			neigh = __neigh_create(tbl, addr, dev, false);
		/* err is only meaningful when neigh is an ERR_PTR value. */
		err = PTR_ERR(neigh);
		if (IS_ERR(neigh))
			goto out_kfree_skb;
		err = neigh->output(neigh, skb);
	}
out:
	return err;
out_kfree_skb:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(neigh_xmit);
dn_neigh_hash, + .key_eq = dn_key_eq, .constructor = dn_neigh_construct, .id = "dn_neigh_cache", .parms ={ diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6b8aad6a0d7d..5f5c674e130a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -122,6 +122,7 @@ * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); +static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -154,6 +155,7 @@ struct neigh_table arp_tbl = { .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, + .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .id = "arp_cache", @@ -209,7 +211,12 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { - return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); + return arp_hashfn(pkey, dev, hash_rnd); +} + +static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) +{ + return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e363bbc2420d..247ad7c298f7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -84,6 +84,7 @@ do { \ static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); +static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -119,6 +120,7 @@ struct neigh_table nd_tbl = { .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, + .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = 
pndisc_constructor, .pdestructor = pndisc_destructor, @@ -295,6 +297,11 @@ static u32 ndisc_hash(const void *pkey, return ndisc_hashfn(pkey, dev, hash_rnd); } +static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) +{ + return neigh_key_eq128(n, pkey); +} + static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 37421db88965..f4286ee7e2b0 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -1,9 +1,30 @@ # # MPLS configuration # + +menuconfig MPLS + tristate "MultiProtocol Label Switching" + default n + ---help--- + MultiProtocol Label Switching routes packets through logical + circuits. Originally conceived as a way of routing packets at + hardware speeds (before hardware was capable of routing ipv4 packets), + MPLS remains a simple way of making tunnels. + + If you have not heard of MPLS you probably want to say N here. + +if MPLS + config NET_MPLS_GSO - tristate "MPLS: GSO support" + bool "MPLS: GSO support" help This is helper module to allow segmentation of non-MPLS GSO packets that have had MPLS stack entries pushed onto them and thus become MPLS GSO packets. + +config MPLS_ROUTING + bool "MPLS: routing support" + help + Add support for forwarding of mpls packets. + +endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 6dec088c2d0f..60af15f1960e 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -2,3 +2,4 @@ # Makefile for MPLS. 
# obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o +obj-$(CONFIG_MPLS_ROUTING) += af_mpls.o diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c new file mode 100644 index 000000000000..23e51d13b0ff --- /dev/null +++ b/net/mpls/af_mpls.c @@ -0,0 +1,974 @@ +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/sysctl.h> +#include <linux/net.h> +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/mpls.h> +#include <net/ip.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/ip_fib.h> +#include <net/netevent.h> +#include <net/netns/generic.h> +#include "internal.h" + +#define LABEL_NOT_SPECIFIED (1<<20) +#define MAX_NEW_LABELS 2 + +/* This maximum ha length copied from the definition of struct neighbour */ +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) + +struct mpls_route { /* next hop label forwarding entry */ + struct net_device *rt_dev; + struct rcu_head rt_rcu; + u32 rt_label[MAX_NEW_LABELS]; + u8 rt_protocol; /* routing protocol that set this entry */ + u8 rt_labels:2, + rt_via_alen:6; + unsigned short rt_via_family; + u8 rt_via[0]; +}; + +static int zero = 0; +static int label_limit = (1 << 20) - 1; + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags); + +static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) +{ + struct mpls_route *rt = NULL; + + if (index < net->mpls.platform_labels) { + struct mpls_route __rcu **platform_label = + rcu_dereference(net->mpls.platform_label); + rt = rcu_dereference(platform_label[index]); + } + return rt; +} + +static bool mpls_output_possible(const struct net_device *dev) +{ + return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); +} + +static unsigned int mpls_rt_header_size(const struct mpls_route *rt) +{ + /* The size of the layer 2.5 labels to be added for this route */ 
+ return rt->rt_labels * sizeof(struct mpls_shim_hdr); +} + +static unsigned int mpls_dev_mtu(const struct net_device *dev) +{ + /* The amount of data the layer 2 frame can hold */ + return dev->mtu; +} + +static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, + struct mpls_entry_decoded dec) +{ + /* RFC4385 and RFC5586 encode other packets in mpls such that + * they don't conflict with the ip version number, making + * decoding by examining the ip version correct in everything + * except for the strangest cases. + * + * The strange cases if we choose to support them will require + * manual configuration. + */ + struct iphdr *hdr4 = ip_hdr(skb); + bool success = true; + + if (hdr4->version == 4) { + skb->protocol = htons(ETH_P_IP); + csum_replace2(&hdr4->check, + htons(hdr4->ttl << 8), + htons(dec.ttl << 8)); + hdr4->ttl = dec.ttl; + } + else if (hdr4->version == 6) { + struct ipv6hdr *hdr6 = ipv6_hdr(skb); + skb->protocol = htons(ETH_P_IPV6); + hdr6->hop_limit = dec.ttl; + } + else + /* version 0 and version 1 are used by pseudo wires */ + success = false; + return success; +} + +static int mpls_forward(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mpls_shim_hdr *hdr; + struct mpls_route *rt; + struct mpls_entry_decoded dec; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + int err; + + /* Careful this entire function runs inside of an rcu critical section */ + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*hdr))) + goto drop; + + /* Read and decode the label */ + hdr = 
mpls_hdr(skb); + dec = mpls_entry_decode(hdr); + + /* Pop the label */ + skb_pull(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + + skb_orphan(skb); + + rt = mpls_route_input_rcu(net, dec.label); + if (!rt) + goto drop; + + /* Find the output device */ + out_dev = rt->rt_dev; + if (!mpls_output_possible(out_dev)) + goto drop; + + if (skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + /* Verify ttl is valid */ + if (dec.ttl <= 2) + goto drop; + dec.ttl -= 1; + + /* Verify the destination can hold the packet */ + new_header_size = mpls_rt_header_size(rt); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + if (unlikely(!new_header_size && dec.bos)) { + /* Penultimate hop popping */ + if (!mpls_egress(rt, skb, dec)) + goto drop; + } else { + bool bos; + int i; + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = dec.bos; + for (i = rt->rt_labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos); + bos = false; + } + } + + err = neigh_xmit(rt->rt_via_family, out_dev, rt->rt_via, skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + return 0; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mpls_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_MPLS_UC), + .func = mpls_forward, +}; + +static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { + [RTA_DST] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +struct mpls_route_config { + u32 rc_protocol; + u32 rc_ifindex; + u16 rc_via_family; + u16 rc_via_alen; + u8 
rc_via[MAX_VIA_ALEN]; + u32 rc_label; + u32 rc_output_labels; + u32 rc_output_label[MAX_NEW_LABELS]; + u32 rc_nlflags; + struct nl_info rc_nlinfo; +}; + +static struct mpls_route *mpls_rt_alloc(size_t alen) +{ + struct mpls_route *rt; + + rt = kzalloc(GFP_KERNEL, sizeof(*rt) + alen); + if (rt) + rt->rt_via_alen = alen; + return rt; +} + +static void mpls_rt_free(struct mpls_route *rt) +{ + if (rt) + kfree_rcu(rt, rt_rcu); +} + +static void mpls_notify_route(struct net *net, unsigned index, + struct mpls_route *old, struct mpls_route *new, + const struct nl_info *info) +{ + struct nlmsghdr *nlh = info ? info->nlh : NULL; + unsigned portid = info ? info->portid : 0; + int event = new ? RTM_NEWROUTE : RTM_DELROUTE; + struct mpls_route *rt = new ? new : old; + unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; + /* Ignore reserved labels for now */ + if (rt && (index >= 16)) + rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); +} + +static void mpls_route_update(struct net *net, unsigned index, + struct net_device *dev, struct mpls_route *new, + const struct nl_info *info) +{ + struct mpls_route *rt, *old = NULL; + + ASSERT_RTNL(); + + rt = net->mpls.platform_label[index]; + if (!dev || (rt && (rt->rt_dev == dev))) { + rcu_assign_pointer(net->mpls.platform_label[index], new); + old = rt; + } + + mpls_notify_route(net, index, old, new, info); + + /* If we removed a route free it now */ + mpls_rt_free(old); +} + +static unsigned find_free_label(struct net *net) +{ + unsigned index; + for (index = 16; index < net->mpls.platform_labels; index++) { + if (!net->mpls.platform_label[index]) + return index; + } + return LABEL_NOT_SPECIFIED; +} + +static int mpls_route_add(struct mpls_route_config *cfg) +{ + struct net *net = cfg->rc_nlinfo.nl_net; + struct net_device *dev = NULL; + struct mpls_route *rt, *old; + unsigned index; + int i; + int err = -EINVAL; + + index = cfg->rc_label; + + /* If a label was not specified during insert pick one */ + if ((index == 
LABEL_NOT_SPECIFIED) && + (cfg->rc_nlflags & NLM_F_CREATE)) { + index = find_free_label(net); + } + + /* The first 16 labels are reserved, and may not be set */ + if (index < 16) + goto errout; + + /* The full 20 bit range may not be supported. */ + if (index >= net->mpls.platform_labels) + goto errout; + + /* Ensure only a supported number of labels are present */ + if (cfg->rc_output_labels > MAX_NEW_LABELS) + goto errout; + + err = -ENODEV; + dev = dev_get_by_index(net, cfg->rc_ifindex); + if (!dev) + goto errout; + + /* For now just support ethernet devices */ + err = -EINVAL; + if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + goto errout; + + err = -EINVAL; + if ((cfg->rc_via_family == AF_PACKET) && + (dev->addr_len != cfg->rc_via_alen)) + goto errout; + + /* Append makes no sense with mpls */ + err = -EINVAL; + if (cfg->rc_nlflags & NLM_F_APPEND) + goto errout; + + err = -EEXIST; + old = net->mpls.platform_label[index]; + if ((cfg->rc_nlflags & NLM_F_EXCL) && old) + goto errout; + + err = -EEXIST; + if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old) + goto errout; + + err = -ENOENT; + if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) + goto errout; + + err = -ENOMEM; + rt = mpls_rt_alloc(cfg->rc_via_alen); + if (!rt) + goto errout; + + rt->rt_labels = cfg->rc_output_labels; + for (i = 0; i < rt->rt_labels; i++) + rt->rt_label[i] = cfg->rc_output_label[i]; + rt->rt_protocol = cfg->rc_protocol; + rt->rt_dev = dev; + rt->rt_via_family = cfg->rc_via_family; + memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen); + + mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo); + + dev_put(dev); + return 0; + +errout: + if (dev) + dev_put(dev); + return err; +} + +static int mpls_route_del(struct mpls_route_config *cfg) +{ + struct net *net = cfg->rc_nlinfo.nl_net; + unsigned index; + int err = -EINVAL; + + index = cfg->rc_label; + + /* The first 16 labels are reserved, and may not be removed */ + if (index < 16) + goto errout; + + /* The full 20 bit range 
may not be supported */ + if (index >= net->mpls.platform_labels) + goto errout; + + mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo); + + err = 0; +errout: + return err; +} + +static void mpls_ifdown(struct net_device *dev) +{ + struct net *net = dev_net(dev); + unsigned index; + + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = net->mpls.platform_label[index]; + if (!rt) + continue; + if (rt->rt_dev != dev) + continue; + rt->rt_dev = NULL; + } +} + +static int mpls_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch(event) { + case NETDEV_UNREGISTER: + mpls_ifdown(dev); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mpls_dev_notifier = { + .notifier_call = mpls_dev_notify, +}; + +static int nla_put_via(struct sk_buff *skb, + u16 family, const void *addr, int alen) +{ + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + return -EMSGSIZE; + + via = nla_data(nla); + via->rtvia_family = family; + memcpy(via->rtvia_addr, addr, alen); + return 0; +} + +int nla_put_labels(struct sk_buff *skb, int attrtype, + u8 labels, const u32 label[]) +{ + struct nlattr *nla; + struct mpls_shim_hdr *nla_label; + bool bos; + int i; + nla = nla_reserve(skb, attrtype, labels*4); + if (!nla) + return -EMSGSIZE; + + nla_label = nla_data(nla); + bos = true; + for (i = labels - 1; i >= 0; i--) { + nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos); + bos = false; + } + + return 0; +} + +int nla_get_labels(const struct nlattr *nla, + u32 max_labels, u32 *labels, u32 label[]) +{ + unsigned len = nla_len(nla); + unsigned nla_labels; + struct mpls_shim_hdr *nla_label; + bool bos; + int i; + + /* len needs to be an even multiple of 4 (the label size) */ + if (len & 3) + return -EINVAL; + + /* Limit the number of new labels allowed */ + nla_labels = len/4; + if (nla_labels > 
max_labels) + return -EINVAL; + + nla_label = nla_data(nla); + bos = true; + for (i = nla_labels - 1; i >= 0; i--, bos = false) { + struct mpls_entry_decoded dec; + dec = mpls_entry_decode(nla_label + i); + + /* Ensure the bottom of stack flag is properly set + * and ttl and tc are both clear. + */ + if ((dec.bos != bos) || dec.ttl || dec.tc) + return -EINVAL; + + label[i] = dec.label; + } + *labels = nla_labels; + return 0; +} + +static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct mpls_route_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int index; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + if (rtm->rtm_family != AF_MPLS) + goto errout; + if (rtm->rtm_dst_len != 20) + goto errout; + if (rtm->rtm_src_len != 0) + goto errout; + if (rtm->rtm_tos != 0) + goto errout; + if (rtm->rtm_table != RT_TABLE_MAIN) + goto errout; + /* Any value is acceptable for rtm_protocol */ + + /* As mpls uses destination specific addresses + * (or source specific address in the case of multicast) + * all addresses have universal scope. 
+ */ + if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) + goto errout; + if (rtm->rtm_type != RTN_UNICAST) + goto errout; + if (rtm->rtm_flags != 0) + goto errout; + + cfg->rc_label = LABEL_NOT_SPECIFIED; + cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_nlflags = nlh->nlmsg_flags; + cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; + cfg->rc_nlinfo.nlh = nlh; + cfg->rc_nlinfo.nl_net = sock_net(skb->sk); + + for (index = 0; index <= RTA_MAX; index++) { + struct nlattr *nla = tb[index]; + if (!nla) + continue; + + switch(index) { + case RTA_OIF: + cfg->rc_ifindex = nla_get_u32(nla); + break; + case RTA_NEWDST: + if (nla_get_labels(nla, MAX_NEW_LABELS, + &cfg->rc_output_labels, + cfg->rc_output_label)) + goto errout; + break; + case RTA_DST: + { + u32 label_count; + if (nla_get_labels(nla, 1, &label_count, + &cfg->rc_label)) + goto errout; + + /* The first 16 labels are reserved, and may not be set */ + if (cfg->rc_label < 16) + goto errout; + + break; + } + case RTA_VIA: + { + struct rtvia *via = nla_data(nla); + cfg->rc_via_family = via->rtvia_family; + cfg->rc_via_alen = nla_len(nla) - 2; + if (cfg->rc_via_alen > MAX_VIA_ALEN) + goto errout; + + /* Validate the address family */ + switch(cfg->rc_via_family) { + case AF_PACKET: + break; + case AF_INET: + if (cfg->rc_via_alen != 4) + goto errout; + break; + case AF_INET6: + if (cfg->rc_via_alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); + break; + } + default: + /* Unsupported attribute */ + goto errout; + } + } + + err = 0; +errout: + return err; +} + +static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_del(&cfg); +} + + +static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = 
rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_add(&cfg); +} + +static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, + u32 label, struct mpls_route *rt, int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_MPLS; + rtm->rtm_dst_len = 20; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = rt->rt_protocol; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + + if (rt->rt_labels && + nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) + goto nla_put_failure; + if (nla_put_via(skb, rt->rt_via_family, rt->rt_via, rt->rt_via_alen)) + goto nla_put_failure; + if (rt->rt_dev && nla_put_u32(skb, RTA_OIF, rt->rt_dev->ifindex)) + goto nla_put_failure; + if (nla_put_labels(skb, RTA_DST, 1, &label)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int index; + + ASSERT_RTNL(); + + index = cb->args[0]; + if (index < 16) + index = 16; + + for (; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt; + rt = net->mpls.platform_label[index]; + if (!rt) + continue; + + if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + index, rt, NLM_F_MULTI) < 0) + break; + } + cb->args[0] = index; + + return skb->len; +} + +static inline size_t lfib_nlmsg_size(struct mpls_route *rt) +{ + size_t payload = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ + + nla_total_size(4); /* RTA_DST */ + if (rt->rt_labels) /* RTA_NEWDST */ + payload += nla_total_size(rt->rt_labels * 
4); + if (rt->rt_dev) /* RTA_OIF */ + payload += nla_total_size(4); + return payload; +} + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags) +{ + struct sk_buff *skb; + u32 seq = nlh ? nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); + + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); +} + +static int resize_platform_label_table(struct net *net, size_t limit) +{ + size_t size = sizeof(struct mpls_route *) * limit; + size_t old_limit; + size_t cp_size; + struct mpls_route __rcu **labels = NULL, **old; + struct mpls_route *rt0 = NULL, *rt2 = NULL; + unsigned index; + + if (size) { + labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!labels) + labels = vzalloc(size); + + if (!labels) + goto nolabels; + } + + /* In case the predefined labels need to be populated */ + if (limit > LABEL_IPV4_EXPLICIT_NULL) { + struct net_device *lo = net->loopback_dev; + rt0 = mpls_rt_alloc(lo->addr_len); + if (!rt0) + goto nort0; + rt0->rt_dev = lo; + rt0->rt_protocol = RTPROT_KERNEL; + rt0->rt_via_family = AF_PACKET; + memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); + } + if (limit > LABEL_IPV6_EXPLICIT_NULL) { + struct net_device *lo = net->loopback_dev; + rt2 = mpls_rt_alloc(lo->addr_len); + if (!rt2) + goto nort2; + rt2->rt_dev = lo; + rt2->rt_protocol = RTPROT_KERNEL; + rt2->rt_via_family = AF_PACKET; + memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len); + } + + rtnl_lock(); + /* Remember the original table */ + old = net->mpls.platform_label; + old_limit = 
net->mpls.platform_labels; + + /* Free any labels beyond the new table */ + for (index = limit; index < old_limit; index++) + mpls_route_update(net, index, NULL, NULL, NULL); + + /* Copy over the old labels */ + cp_size = size; + if (old_limit < limit) + cp_size = old_limit * sizeof(struct mpls_route *); + + memcpy(labels, old, cp_size); + + /* If needed set the predefined labels */ + if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) && + (limit > LABEL_IPV6_EXPLICIT_NULL)) { + labels[LABEL_IPV6_EXPLICIT_NULL] = rt2; + rt2 = NULL; + } + + if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) && + (limit > LABEL_IPV4_EXPLICIT_NULL)) { + labels[LABEL_IPV4_EXPLICIT_NULL] = rt0; + rt0 = NULL; + } + + /* Update the global pointers */ + net->mpls.platform_labels = limit; + net->mpls.platform_label = labels; + + rtnl_unlock(); + + mpls_rt_free(rt2); + mpls_rt_free(rt0); + + if (old) { + synchronize_rcu(); + kvfree(old); + } + return 0; + +nort2: + mpls_rt_free(rt0); +nort0: + kvfree(labels); +nolabels: + return -ENOMEM; +} + +static int mpls_platform_labels(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = table->data; + int platform_labels = net->mpls.platform_labels; + int ret; + struct ctl_table tmp = { + .procname = table->procname, + .data = &platform_labels, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &label_limit, + }; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = resize_platform_label_table(net, platform_labels); + + return ret; +} + +static struct ctl_table mpls_table[] = { + { + .procname = "platform_labels", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = mpls_platform_labels, + }, + { } +}; + +static int mpls_net_init(struct net *net) +{ + struct ctl_table *table; + + net->mpls.platform_labels = 0; + net->mpls.platform_label = NULL; + + table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); + if 
(table == NULL) + return -ENOMEM; + + table[0].data = net; + net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); + if (net->mpls.ctl == NULL) + return -ENOMEM; + + return 0; +} + +static void mpls_net_exit(struct net *net) +{ + struct ctl_table *table; + unsigned int index; + + table = net->mpls.ctl->ctl_table_arg; + unregister_net_sysctl_table(net->mpls.ctl); + kfree(table); + + /* An rcu grace period haselapsed since there was a device in + * the network namespace (and thus the last in fqlight packet) + * left this network namespace. This is because + * unregister_netdevice_many and netdev_run_todo has completed + * for each network device that was in this network namespace. + * + * As such no additional rcu synchronization is necessary when + * freeing the platform_label table. + */ + rtnl_lock(); + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = net->mpls.platform_label[index]; + rcu_assign_pointer(net->mpls.platform_label[index], NULL); + mpls_rt_free(rt); + } + rtnl_unlock(); + + kvfree(net->mpls.platform_label); +} + +static struct pernet_operations mpls_net_ops = { + .init = mpls_net_init, + .exit = mpls_net_exit, +}; + +static int __init mpls_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); + + err = register_pernet_subsys(&mpls_net_ops); + if (err) + goto out; + + err = register_netdevice_notifier(&mpls_dev_notifier); + if (err) + goto out_unregister_pernet; + + dev_add_pack(&mpls_packet_type); + + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); + err = 0; +out: + return err; + +out_unregister_pernet: + unregister_pernet_subsys(&mpls_net_ops); + goto out; +} +module_init(mpls_init); + +static void __exit mpls_exit(void) +{ + rtnl_unregister_all(PF_MPLS); + dev_remove_pack(&mpls_packet_type); + 
unregister_netdevice_notifier(&mpls_dev_notifier); + unregister_pernet_subsys(&mpls_net_ops); +} +module_exit(mpls_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_NETPROTO(PF_MPLS); diff --git a/net/mpls/internal.h b/net/mpls/internal.h new file mode 100644 index 000000000000..fb6de92052c4 --- /dev/null +++ b/net/mpls/internal.h @@ -0,0 +1,59 @@ +#ifndef MPLS_INTERNAL_H +#define MPLS_INTERNAL_H + +#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ +#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */ +#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ +#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */ +#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define LABEL_GAL 13 /* RFC5586 */ +#define LABEL_OAM_ALERT 14 /* RFC3429 */ +#define LABEL_EXTENSION 15 /* RFC7274 */ + + +struct mpls_shim_hdr { + __be32 label_stack_entry; +}; + +struct mpls_entry_decoded { + u32 label; + u8 ttl; + u8 tc; + u8 bos; +}; + +struct sk_buff; + +static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) +{ + return (struct mpls_shim_hdr *)skb_network_header(skb); +} + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) +{ + struct mpls_shim_hdr result; + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ? 
(1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + +static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) +{ + struct mpls_entry_decoded result; + unsigned entry = be32_to_cpu(hdr->label_stack_entry); + + result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; + + return result; +} + +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); + +#endif /* MPLS_INTERNAL_H */ |