summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Documentation/networking/mpls-sysctl.txt | 20
-rw-r--r-- drivers/net/ethernet/cadence/Kconfig | 4
-rw-r--r-- drivers/net/ethernet/emulex/benet/be.h | 9
-rw-r--r-- drivers/net/ethernet/emulex/benet/be_cmds.c | 114
-rw-r--r-- drivers/net/ethernet/emulex/benet/be_cmds.h | 23
-rw-r--r-- drivers/net/ethernet/emulex/benet/be_ethtool.c | 2
-rw-r--r-- drivers/net/ethernet/emulex/benet/be_main.c | 260
-rw-r--r-- drivers/net/ethernet/renesas/sh_eth.c | 259
-rw-r--r-- drivers/net/ethernet/renesas/sh_eth.h | 23
-rw-r--r-- include/linux/socket.h | 2
-rw-r--r-- include/net/arp.h | 19
-rw-r--r-- include/net/ndisc.h | 19
-rw-r--r-- include/net/neighbour.h | 55
-rw-r--r-- include/net/net_namespace.h | 4
-rw-r--r-- include/net/netns/mpls.h | 17
-rw-r--r-- include/uapi/linux/rtnetlink.h | 10
-rw-r--r-- net/Makefile | 2
-rw-r--r-- net/core/neighbour.c | 54
-rw-r--r-- net/decnet/dn_neigh.c | 6
-rw-r--r-- net/ipv4/arp.c | 9
-rw-r--r-- net/ipv6/ndisc.c | 7
-rw-r--r-- net/mpls/Kconfig | 23
-rw-r--r-- net/mpls/Makefile | 1
-rw-r--r-- net/mpls/af_mpls.c | 974
-rw-r--r-- net/mpls/internal.h | 59
25 files changed, 1773 insertions, 202 deletions
diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
new file mode 100644
index 000000000000..639ddf0ece9b
--- /dev/null
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -0,0 +1,20 @@
+/proc/sys/net/mpls/* Variables:
+
+platform_labels - INTEGER
+ Number of entries in the platform label table. It is not
+ possible to configure forwarding for label values equal to or
+ greater than the number of platform labels.
+
+ A dense utilization of the entries in the platform label table
+ is possible and expected as the platform labels are locally
+ allocated.
+
+ If the number of platform label table entries is set to 0 no
+ label will be recognized by the kernel and mpls forwarding
+ will be disabled.
+
+ Reducing this value will remove all label routing entries that
+ no longer fit in the table.
+
+ Possible values: 0 - 1048575
+ Default: 0
diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig
index 321d2ad235d9..739bb0048ebf 100644
--- a/drivers/net/ethernet/cadence/Kconfig
+++ b/drivers/net/ethernet/cadence/Kconfig
@@ -4,7 +4,7 @@
config NET_CADENCE
bool "Cadence devices"
- depends on HAS_IOMEM && (ARM || AVR32 || MICROBLAZE || COMPILE_TEST)
+ depends on HAS_IOMEM
default y
---help---
If you have a network (Ethernet) card belonging to this class, say Y.
@@ -30,7 +30,7 @@ config ARM_AT91_ETHER
config MACB
tristate "Cadence MACB/GEM support"
- depends on HAS_DMA && (PLATFORM_AT32AP || ARCH_AT91 || ARCH_PICOXCELL || ARCH_ZYNQ || MICROBLAZE || COMPILE_TEST)
+ depends on HAS_DMA
select PHYLIB
---help---
The Cadence MACB ethernet interface is found on many Atmel AT32 and
diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index fac806a15a61..996bbc6a244f 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -87,6 +87,7 @@
#define BE3_MAX_EVT_QS 16
#define BE3_SRIOV_MAX_EVT_QS 8
+#define MAX_RSS_IFACES 15
#define MAX_RX_QS 32
#define MAX_EVT_QS 32
#define MAX_TX_QS 32
@@ -411,8 +412,11 @@ struct be_resources {
u16 max_tx_qs;
u16 max_rss_qs;
u16 max_rx_qs;
+ u16 max_cq_count;
u16 max_uc_mac; /* Max UC MACs programmable */
u16 max_vlans; /* Number of vlans supported */
+ u16 max_iface_count;
+ u16 max_mcc_count;
u16 max_evt_qs;
u32 if_cap_flags;
u32 vf_if_cap_flags; /* VF if capability flags */
@@ -488,6 +492,8 @@ struct be_adapter {
/* Rx rings */
u16 num_rx_qs;
+ u16 num_rss_qs;
+ u16 need_def_rxq;
struct be_rx_obj rx_obj[MAX_RX_QS];
u32 big_page_size; /* Compounded page size shared by rx wrbs */
@@ -635,9 +641,8 @@ extern const struct ethtool_ops be_ethtool_ops;
for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs; \
i++, rxo++)
-/* Skip the default non-rss queue (last one)*/
#define for_all_rss_queues(adapter, rxo, i) \
- for (i = 0, rxo = &adapter->rx_obj[i]; i < (adapter->num_rx_qs - 1);\
+ for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rss_qs; \
i++, rxo++)
#define for_all_tx_queues(adapter, txo, i) \
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
index be00695b3be7..75cb4610423b 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -3577,12 +3577,12 @@ static void be_copy_nic_desc(struct be_resources *res,
res->max_rss_qs = le16_to_cpu(desc->rssq_count);
res->max_rx_qs = le16_to_cpu(desc->rq_count);
res->max_evt_qs = le16_to_cpu(desc->eq_count);
+ res->max_cq_count = le16_to_cpu(desc->cq_count);
+ res->max_iface_count = le16_to_cpu(desc->iface_count);
+ res->max_mcc_count = le16_to_cpu(desc->mcc_count);
/* Clear flags that driver is not interested in */
res->if_cap_flags = le32_to_cpu(desc->cap_flags) &
BE_IF_CAP_FLAGS_WANT;
- /* Need 1 RXQ as the default RXQ */
- if (res->max_rss_qs && res->max_rss_qs == res->max_rx_qs)
- res->max_rss_qs -= 1;
}
/* Uses Mbox */
@@ -3644,7 +3644,7 @@ err:
/* Will use MBOX only if MCCQ has not been created */
int be_cmd_get_profile_config(struct be_adapter *adapter,
- struct be_resources *res, u8 domain)
+ struct be_resources *res, u8 query, u8 domain)
{
struct be_cmd_resp_get_profile_config *resp;
struct be_cmd_req_get_profile_config *req;
@@ -3654,7 +3654,7 @@ int be_cmd_get_profile_config(struct be_adapter *adapter,
struct be_nic_res_desc *nic;
struct be_mcc_wrb wrb = {0};
struct be_dma_mem cmd;
- u32 desc_count;
+ u16 desc_count;
int status;
memset(&cmd, 0, sizeof(struct be_dma_mem));
@@ -3673,12 +3673,19 @@ int be_cmd_get_profile_config(struct be_adapter *adapter,
req->hdr.version = 1;
req->type = ACTIVE_PROFILE_TYPE;
+ /* When QUERY_MODIFIABLE_FIELDS_TYPE bit is set, cmd returns the
+ * descriptors with all bits set to "1" for the fields which can be
+ * modified using SET_PROFILE_CONFIG cmd.
+ */
+ if (query == RESOURCE_MODIFIABLE)
+ req->type |= QUERY_MODIFIABLE_FIELDS_TYPE;
+
status = be_cmd_notify_wait(adapter, &wrb);
if (status)
goto err;
resp = cmd.va;
- desc_count = le32_to_cpu(resp->desc_count);
+ desc_count = le16_to_cpu(resp->desc_count);
pcie = be_get_pcie_desc(adapter->pdev->devfn, resp->func_param,
desc_count);
@@ -3803,23 +3810,80 @@ int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, u16 link_speed,
1, version, domain);
}
+static void be_fill_vf_res_template(struct be_adapter *adapter,
+ struct be_resources pool_res,
+ u16 num_vfs, u16 num_vf_qs,
+ struct be_nic_res_desc *nic_vft)
+{
+ u32 vf_if_cap_flags = pool_res.vf_if_cap_flags;
+ struct be_resources res_mod = {0};
+
+ /* Resource with fields set to all '1's by GET_PROFILE_CONFIG cmd,
+ * which are modifiable using SET_PROFILE_CONFIG cmd.
+ */
+ be_cmd_get_profile_config(adapter, &res_mod, RESOURCE_MODIFIABLE, 0);
+
+ /* If RSS IFACE capability flags are modifiable for a VF, set the
+ * capability flag as valid and set RSS and DEFQ_RSS IFACE flags if
+ * more than 1 RSSQ is available for a VF.
+ * Otherwise, provision only 1 queue pair for VF.
+ */
+ if (res_mod.vf_if_cap_flags & BE_IF_FLAGS_RSS) {
+ nic_vft->flags |= BIT(IF_CAPS_FLAGS_VALID_SHIFT);
+ if (num_vf_qs > 1) {
+ vf_if_cap_flags |= BE_IF_FLAGS_RSS;
+ if (pool_res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS)
+ vf_if_cap_flags |= BE_IF_FLAGS_DEFQ_RSS;
+ } else {
+ vf_if_cap_flags &= ~(BE_IF_FLAGS_RSS |
+ BE_IF_FLAGS_DEFQ_RSS);
+ }
+
+ nic_vft->cap_flags = cpu_to_le32(vf_if_cap_flags);
+ } else {
+ num_vf_qs = 1;
+ }
+
+ nic_vft->rq_count = cpu_to_le16(num_vf_qs);
+ nic_vft->txq_count = cpu_to_le16(num_vf_qs);
+ nic_vft->rssq_count = cpu_to_le16(num_vf_qs);
+ nic_vft->cq_count = cpu_to_le16(pool_res.max_cq_count /
+ (num_vfs + 1));
+
+ /* Distribute unicast MACs, VLANs, IFACE count and MCCQ count equally
+ * among the PF and its VFs, if the fields are changeable
+ */
+ if (res_mod.max_uc_mac == FIELD_MODIFIABLE)
+ nic_vft->unicast_mac_count = cpu_to_le16(pool_res.max_uc_mac /
+ (num_vfs + 1));
+
+ if (res_mod.max_vlans == FIELD_MODIFIABLE)
+ nic_vft->vlan_count = cpu_to_le16(pool_res.max_vlans /
+ (num_vfs + 1));
+
+ if (res_mod.max_iface_count == FIELD_MODIFIABLE)
+ nic_vft->iface_count = cpu_to_le16(pool_res.max_iface_count /
+ (num_vfs + 1));
+
+ if (res_mod.max_mcc_count == FIELD_MODIFIABLE)
+ nic_vft->mcc_count = cpu_to_le16(pool_res.max_mcc_count /
+ (num_vfs + 1));
+}
+
int be_cmd_set_sriov_config(struct be_adapter *adapter,
- struct be_resources res, u16 num_vfs)
+ struct be_resources pool_res, u16 num_vfs,
+ u16 num_vf_qs)
{
struct {
struct be_pcie_res_desc pcie;
struct be_nic_res_desc nic_vft;
} __packed desc;
- u16 vf_q_count;
-
- if (BEx_chip(adapter) || lancer_chip(adapter))
- return 0;
/* PF PCIE descriptor */
be_reset_pcie_desc(&desc.pcie);
desc.pcie.hdr.desc_type = PCIE_RESOURCE_DESC_TYPE_V1;
desc.pcie.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
- desc.pcie.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT);
+ desc.pcie.flags = BIT(IMM_SHIFT) | BIT(NOSV_SHIFT);
desc.pcie.pf_num = adapter->pdev->devfn;
desc.pcie.sriov_state = num_vfs ? 1 : 0;
desc.pcie.num_vfs = cpu_to_le16(num_vfs);
@@ -3828,32 +3892,12 @@ int be_cmd_set_sriov_config(struct be_adapter *adapter,
be_reset_nic_desc(&desc.nic_vft);
desc.nic_vft.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1;
desc.nic_vft.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
- desc.nic_vft.flags = (1 << VFT_SHIFT) | (1 << IMM_SHIFT) |
- (1 << NOSV_SHIFT);
+ desc.nic_vft.flags = BIT(VFT_SHIFT) | BIT(IMM_SHIFT) | BIT(NOSV_SHIFT);
desc.nic_vft.pf_num = adapter->pdev->devfn;
desc.nic_vft.vf_num = 0;
- if (num_vfs && res.vf_if_cap_flags & BE_IF_FLAGS_RSS) {
- /* If number of VFs requested is 8 less than max supported,
- * assign 8 queue pairs to the PF and divide the remaining
- * resources evenly among the VFs
- */
- if (num_vfs < (be_max_vfs(adapter) - 8))
- vf_q_count = (res.max_rss_qs - 8) / num_vfs;
- else
- vf_q_count = res.max_rss_qs / num_vfs;
-
- desc.nic_vft.rq_count = cpu_to_le16(vf_q_count);
- desc.nic_vft.txq_count = cpu_to_le16(vf_q_count);
- desc.nic_vft.rssq_count = cpu_to_le16(vf_q_count - 1);
- desc.nic_vft.cq_count = cpu_to_le16(3 * vf_q_count);
- } else {
- desc.nic_vft.txq_count = cpu_to_le16(1);
- desc.nic_vft.rq_count = cpu_to_le16(1);
- desc.nic_vft.rssq_count = cpu_to_le16(0);
- /* One CQ for each TX, RX and MCCQ */
- desc.nic_vft.cq_count = cpu_to_le16(3);
- }
+ be_fill_vf_res_template(adapter, pool_res, num_vfs, num_vf_qs,
+ &desc.nic_vft);
return be_cmd_set_profile_config(adapter, &desc,
2 * RESOURCE_DESC_SIZE_V1, 2, 1, 0);
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
index db761e8e42a3..53e903f37247 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -588,14 +588,15 @@ enum be_if_flags {
BE_IF_FLAGS_MCAST_PROMISCUOUS = 0x200,
BE_IF_FLAGS_PASS_L2_ERRORS = 0x400,
BE_IF_FLAGS_PASS_L3L4_ERRORS = 0x800,
- BE_IF_FLAGS_MULTICAST = 0x1000
+ BE_IF_FLAGS_MULTICAST = 0x1000,
+ BE_IF_FLAGS_DEFQ_RSS = 0x1000000
};
#define BE_IF_CAP_FLAGS_WANT (BE_IF_FLAGS_RSS | BE_IF_FLAGS_PROMISCUOUS |\
BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_VLAN_PROMISCUOUS |\
BE_IF_FLAGS_VLAN | BE_IF_FLAGS_MCAST_PROMISCUOUS |\
BE_IF_FLAGS_PASS_L3L4_ERRORS | BE_IF_FLAGS_MULTICAST |\
- BE_IF_FLAGS_UNTAGGED)
+ BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_DEFQ_RSS)
#define BE_IF_FLAGS_ALL_PROMISCUOUS (BE_IF_FLAGS_PROMISCUOUS | \
BE_IF_FLAGS_VLAN_PROMISCUOUS |\
@@ -2021,6 +2022,7 @@ struct be_cmd_req_set_ext_fat_caps {
#define PORT_RESOURCE_DESC_TYPE_V1 0x55
#define MAX_RESOURCE_DESC 264
+#define IF_CAPS_FLAGS_VALID_SHIFT 0 /* IF caps valid */
#define VFT_SHIFT 3 /* VF template */
#define IMM_SHIFT 6 /* Immediate */
#define NOSV_SHIFT 7 /* No save */
@@ -2131,20 +2133,28 @@ struct be_cmd_resp_get_func_config {
u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1];
};
-#define ACTIVE_PROFILE_TYPE 0x2
+enum {
+ RESOURCE_LIMITS,
+ RESOURCE_MODIFIABLE
+};
+
struct be_cmd_req_get_profile_config {
struct be_cmd_req_hdr hdr;
u8 rsvd;
+#define ACTIVE_PROFILE_TYPE 0x2
+#define QUERY_MODIFIABLE_FIELDS_TYPE BIT(3)
u8 type;
u16 rsvd1;
};
struct be_cmd_resp_get_profile_config {
struct be_cmd_resp_hdr hdr;
- u32 desc_count;
+ __le16 desc_count;
+ u16 rsvd;
u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1];
};
+#define FIELD_MODIFIABLE 0xFFFF
struct be_cmd_req_set_profile_config {
struct be_cmd_req_hdr hdr;
u32 rsvd;
@@ -2344,7 +2354,7 @@ int be_cmd_query_port_name(struct be_adapter *adapter);
int be_cmd_get_func_config(struct be_adapter *adapter,
struct be_resources *res);
int be_cmd_get_profile_config(struct be_adapter *adapter,
- struct be_resources *res, u8 domain);
+ struct be_resources *res, u8 query, u8 domain);
int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile);
int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg,
int vf_num);
@@ -2355,4 +2365,5 @@ int be_cmd_set_logical_link_config(struct be_adapter *adapter,
int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port);
int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op);
int be_cmd_set_sriov_config(struct be_adapter *adapter,
- struct be_resources res, u16 num_vfs);
+ struct be_resources res, u16 num_vfs,
+ u16 num_vf_qs);
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index 4d2de4700769..b765c24625bf 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -1097,7 +1097,7 @@ static int be_set_rss_hash_opts(struct be_adapter *adapter,
return status;
if (be_multi_rxq(adapter)) {
- for (j = 0; j < 128; j += adapter->num_rx_qs - 1) {
+ for (j = 0; j < 128; j += adapter->num_rss_qs) {
for_all_rss_queues(adapter, rxo, i) {
if ((j + i) >= 128)
break;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 7eccebc676e2..5652b005947f 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -30,6 +30,9 @@ MODULE_DESCRIPTION(DRV_DESC " " DRV_VER);
MODULE_AUTHOR("Emulex Corporation");
MODULE_LICENSE("GPL");
+/* num_vfs module param is obsolete.
+ * Use sysfs method to enable/disable VFs.
+ */
static unsigned int num_vfs;
module_param(num_vfs, uint, S_IRUGO);
MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize");
@@ -2454,13 +2457,19 @@ static int be_rx_cqs_create(struct be_adapter *adapter)
int rc, i;
/* We can create as many RSS rings as there are EQs. */
- adapter->num_rx_qs = adapter->num_evt_qs;
+ adapter->num_rss_qs = adapter->num_evt_qs;
+
+ /* We'll use RSS only if at least 2 RSS rings are supported. */
+ if (adapter->num_rss_qs <= 1)
+ adapter->num_rss_qs = 0;
- /* We'll use RSS only if atleast 2 RSS rings are supported.
- * When RSS is used, we'll need a default RXQ for non-IP traffic.
+ adapter->num_rx_qs = adapter->num_rss_qs + adapter->need_def_rxq;
+
+ /* When the interface is not capable of RSS rings (and there is no
+ * need to create a default RXQ) we'll still need one RXQ
*/
- if (adapter->num_rx_qs > 1)
- adapter->num_rx_qs++;
+ if (adapter->num_rx_qs == 0)
+ adapter->num_rx_qs = 1;
adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE;
for_all_rx_queues(adapter, rxo, i) {
@@ -2479,8 +2488,7 @@ static int be_rx_cqs_create(struct be_adapter *adapter)
}
dev_info(&adapter->pdev->dev,
- "created %d RSS queue(s) and 1 default RX queue\n",
- adapter->num_rx_qs - 1);
+ "created %d RX queue(s)\n", adapter->num_rx_qs);
return 0;
}
@@ -3110,12 +3118,14 @@ static int be_rx_qs_create(struct be_adapter *adapter)
return rc;
}
- /* The FW would like the default RXQ to be created first */
- rxo = default_rxo(adapter);
- rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size,
- adapter->if_handle, false, &rxo->rss_id);
- if (rc)
- return rc;
+ if (adapter->need_def_rxq || !adapter->num_rss_qs) {
+ rxo = default_rxo(adapter);
+ rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
+ rx_frag_size, adapter->if_handle,
+ false, &rxo->rss_id);
+ if (rc)
+ return rc;
+ }
for_all_rss_queues(adapter, rxo, i) {
rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
@@ -3126,8 +3136,7 @@ static int be_rx_qs_create(struct be_adapter *adapter)
}
if (be_multi_rxq(adapter)) {
- for (j = 0; j < RSS_INDIR_TABLE_LEN;
- j += adapter->num_rx_qs - 1) {
+ for (j = 0; j < RSS_INDIR_TABLE_LEN; j += adapter->num_rss_qs) {
for_all_rss_queues(adapter, rxo, i) {
if ((j + i) >= RSS_INDIR_TABLE_LEN)
break;
@@ -3402,8 +3411,39 @@ static void be_disable_vxlan_offloads(struct be_adapter *adapter)
}
#endif
+static u16 be_calculate_vf_qs(struct be_adapter *adapter, u16 num_vfs)
+{
+ struct be_resources res = adapter->pool_res;
+ u16 num_vf_qs = 1;
+
+ /* Distribute the queue resources equally among the PF and its VFs.
+ * Do not distribute queue resources in multi-channel configuration.
+ */
+ if (num_vfs && !be_is_mc(adapter)) {
+ /* If the number of VFs requested is more than 8 below the max
+ * supported, assign 8 queue pairs to the PF and divide the
+ * remaining resources evenly among the VFs
+ */
+ if (num_vfs < (be_max_vfs(adapter) - 8))
+ num_vf_qs = (res.max_rss_qs - 8) / num_vfs;
+ else
+ num_vf_qs = res.max_rss_qs / num_vfs;
+
+ /* Skyhawk-R chip supports only MAX_RSS_IFACES RSS capable
+ * interfaces per port. Provide RSS on VFs, only if number
+ * of VFs requested is less than MAX_RSS_IFACES limit.
+ */
+ if (num_vfs >= MAX_RSS_IFACES)
+ num_vf_qs = 1;
+ }
+ return num_vf_qs;
+}
+
static int be_clear(struct be_adapter *adapter)
{
+ struct pci_dev *pdev = adapter->pdev;
+ u16 num_vf_qs;
+
be_cancel_worker(adapter);
if (sriov_enabled(adapter))
@@ -3412,9 +3452,14 @@ static int be_clear(struct be_adapter *adapter)
/* Re-configure FW to distribute resources evenly across max-supported
* number of VFs, only when VFs are not already enabled.
*/
- if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev))
+ if (skyhawk_chip(adapter) && be_physfn(adapter) &&
+ !pci_vfs_assigned(pdev)) {
+ num_vf_qs = be_calculate_vf_qs(adapter,
+ pci_sriov_get_totalvfs(pdev));
be_cmd_set_sriov_config(adapter, adapter->pool_res,
- pci_sriov_get_totalvfs(adapter->pdev));
+ pci_sriov_get_totalvfs(pdev),
+ num_vf_qs);
+ }
#ifdef CONFIG_BE2NET_VXLAN
be_disable_vxlan_offloads(adapter);
@@ -3439,7 +3484,7 @@ static int be_if_create(struct be_adapter *adapter, u32 *if_handle,
en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |
- BE_IF_FLAGS_RSS;
+ BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS;
en_flags &= cap_flags;
@@ -3463,6 +3508,7 @@ static int be_vfs_if_create(struct be_adapter *adapter)
for_all_vfs(adapter, vf_cfg, vf) {
if (!BE3_chip(adapter)) {
status = be_cmd_get_profile_config(adapter, &res,
+ RESOURCE_LIMITS,
vf + 1);
if (!status)
cap_flags = res.if_cap_flags;
@@ -3629,7 +3675,8 @@ static void BEx_get_resources(struct be_adapter *adapter,
/* On a SuperNIC profile, the driver needs to use the
* GET_PROFILE_CONFIG cmd to query the per-function TXQ limits
*/
- be_cmd_get_profile_config(adapter, &super_nic_res, 0);
+ be_cmd_get_profile_config(adapter, &super_nic_res,
+ RESOURCE_LIMITS, 0);
/* Some old versions of BE3 FW don't report max_tx_qs value */
res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS;
} else {
@@ -3649,6 +3696,7 @@ static void BEx_get_resources(struct be_adapter *adapter,
res->max_evt_qs = 1;
res->if_cap_flags = BE_IF_CAP_FLAGS_WANT;
+ res->if_cap_flags &= ~BE_IF_FLAGS_DEFQ_RSS;
if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS))
res->if_cap_flags &= ~BE_IF_FLAGS_RSS;
}
@@ -3668,13 +3716,12 @@ static void be_setup_init(struct be_adapter *adapter)
static int be_get_sriov_config(struct be_adapter *adapter)
{
- struct device *dev = &adapter->pdev->dev;
struct be_resources res = {0};
int max_vfs, old_vfs;
- /* Some old versions of BE3 FW don't report max_vfs value */
- be_cmd_get_profile_config(adapter, &res, 0);
+ be_cmd_get_profile_config(adapter, &res, RESOURCE_LIMITS, 0);
+ /* Some old versions of BE3 FW don't report max_vfs value */
if (BE3_chip(adapter) && !res.max_vfs) {
max_vfs = pci_sriov_get_totalvfs(adapter->pdev);
res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0;
@@ -3682,35 +3729,49 @@ static int be_get_sriov_config(struct be_adapter *adapter)
adapter->pool_res = res;
- if (!be_max_vfs(adapter)) {
- if (num_vfs)
- dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n");
- adapter->num_vfs = 0;
- return 0;
- }
-
- pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
-
- /* validate num_vfs module param */
+ /* If during previous unload of the driver, the VFs were not disabled,
+ * then we cannot rely on the PF POOL limits for the TotalVFs value.
+ * Instead use the TotalVFs value stored in the pci-dev struct.
+ */
old_vfs = pci_num_vf(adapter->pdev);
if (old_vfs) {
- dev_info(dev, "%d VFs are already enabled\n", old_vfs);
- if (old_vfs != num_vfs)
- dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs);
+ dev_info(&adapter->pdev->dev, "%d VFs are already enabled\n",
+ old_vfs);
+
+ adapter->pool_res.max_vfs =
+ pci_sriov_get_totalvfs(adapter->pdev);
adapter->num_vfs = old_vfs;
- } else {
- if (num_vfs > be_max_vfs(adapter)) {
- dev_info(dev, "Resources unavailable to init %d VFs\n",
- num_vfs);
- dev_info(dev, "Limiting to %d VFs\n",
- be_max_vfs(adapter));
- }
- adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter));
}
return 0;
}
+static void be_alloc_sriov_res(struct be_adapter *adapter)
+{
+ int old_vfs = pci_num_vf(adapter->pdev);
+ u16 num_vf_qs;
+ int status;
+
+ be_get_sriov_config(adapter);
+
+ if (!old_vfs)
+ pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
+
+ /* When the HW is in SRIOV capable configuration, the PF-pool
+ * resources are given to PF during driver load, if there are no
+ * old VFs. This facility is not available in BE3 FW.
+ * Also, this is done by FW in Lancer chip.
+ */
+ if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) {
+ num_vf_qs = be_calculate_vf_qs(adapter, 0);
+ status = be_cmd_set_sriov_config(adapter, adapter->pool_res, 0,
+ num_vf_qs);
+ if (status)
+ dev_err(&adapter->pdev->dev,
+ "Failed to optimize SRIOV resources\n");
+ }
+}
+
static int be_get_resources(struct be_adapter *adapter)
{
struct device *dev = &adapter->pdev->dev;
@@ -3731,12 +3792,23 @@ static int be_get_resources(struct be_adapter *adapter)
if (status)
return status;
+ /* If a default RXQ must be created, we'll use up one RSSQ */
+ if (res.max_rss_qs && res.max_rss_qs == res.max_rx_qs &&
+ !(res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS))
+ res.max_rss_qs -= 1;
+
/* If RoCE may be enabled stash away half the EQs for RoCE */
if (be_roce_supported(adapter))
res.max_evt_qs /= 2;
adapter->res = res;
}
+ /* If FW supports RSS default queue, then skip creating non-RSS
+ * queue for non-IP traffic.
+ */
+ adapter->need_def_rxq = (be_if_cap_flags(adapter) &
+ BE_IF_FLAGS_DEFQ_RSS) ? 0 : 1;
+
dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n",
be_max_txqs(adapter), be_max_rxqs(adapter),
be_max_rss(adapter), be_max_eqs(adapter),
@@ -3745,38 +3817,12 @@ static int be_get_resources(struct be_adapter *adapter)
be_max_uc(adapter), be_max_mc(adapter),
be_max_vlans(adapter));
+ /* Sanitize cfg_num_qs based on HW and platform limits */
+ adapter->cfg_num_qs = min_t(u16, netif_get_num_default_rss_queues(),
+ be_max_qs(adapter));
return 0;
}
-static void be_sriov_config(struct be_adapter *adapter)
-{
- struct device *dev = &adapter->pdev->dev;
- int status;
-
- status = be_get_sriov_config(adapter);
- if (status) {
- dev_err(dev, "Failed to query SR-IOV configuration\n");
- dev_err(dev, "SR-IOV cannot be enabled\n");
- return;
- }
-
- /* When the HW is in SRIOV capable configuration, the PF-pool
- * resources are equally distributed across the max-number of
- * VFs. The user may request only a subset of the max-vfs to be
- * enabled. Based on num_vfs, redistribute the resources across
- * num_vfs so that each VF will have access to more number of
- * resources. This facility is not available in BE3 FW.
- * Also, this is done by FW in Lancer chip.
- */
- if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) {
- status = be_cmd_set_sriov_config(adapter,
- adapter->pool_res,
- adapter->num_vfs);
- if (status)
- dev_err(dev, "Failed to optimize SR-IOV resources\n");
- }
-}
-
static int be_get_config(struct be_adapter *adapter)
{
int status, level;
@@ -3807,9 +3853,6 @@ static int be_get_config(struct be_adapter *adapter)
"Using profile 0x%x\n", profile_id);
}
- if (!BE2_chip(adapter) && be_physfn(adapter))
- be_sriov_config(adapter);
-
status = be_get_resources(adapter);
if (status)
return status;
@@ -3819,9 +3862,6 @@ static int be_get_config(struct be_adapter *adapter)
if (!adapter->pmac_id)
return -ENOMEM;
- /* Sanitize cfg_num_qs based on HW and platform limits */
- adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter));
-
return 0;
}
@@ -3996,6 +4036,9 @@ static int be_setup(struct be_adapter *adapter)
if (!lancer_chip(adapter))
be_cmd_req_native_mode(adapter);
+ if (!BE2_chip(adapter) && be_physfn(adapter))
+ be_alloc_sriov_res(adapter);
+
status = be_get_config(adapter);
if (status)
goto err;
@@ -5217,7 +5260,6 @@ static int be_drv_init(struct be_adapter *adapter)
/* Must be a power of 2 or else MODULO will BUG_ON */
adapter->be_get_temp_freq = 64;
- adapter->cfg_num_qs = netif_get_num_default_rss_queues();
return 0;
@@ -5541,6 +5583,60 @@ err:
dev_err(&adapter->pdev->dev, "EEH resume failed\n");
}
+static int be_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+ struct be_adapter *adapter = pci_get_drvdata(pdev);
+ u16 num_vf_qs;
+ int status;
+
+ if (!num_vfs)
+ be_vf_clear(adapter);
+
+ adapter->num_vfs = num_vfs;
+
+ if (adapter->num_vfs == 0 && pci_vfs_assigned(pdev)) {
+ dev_warn(&pdev->dev,
+ "Cannot disable VFs while they are assigned\n");
+ return -EBUSY;
+ }
+
+ /* When the HW is in SRIOV capable configuration, the PF-pool resources
+ * are equally distributed across the max-number of VFs. The user may
+ * request only a subset of the max-vfs to be enabled.
+ * Based on num_vfs, redistribute the resources across num_vfs so that
+ * each VF will have access to more number of resources.
+ * This facility is not available in BE3 FW.
+ * Also, this is done by FW in Lancer chip.
+ */
+ if (skyhawk_chip(adapter) && !pci_num_vf(pdev)) {
+ num_vf_qs = be_calculate_vf_qs(adapter, adapter->num_vfs);
+ status = be_cmd_set_sriov_config(adapter, adapter->pool_res,
+ adapter->num_vfs, num_vf_qs);
+ if (status)
+ dev_err(&pdev->dev,
+ "Failed to optimize SR-IOV resources\n");
+ }
+
+ status = be_get_resources(adapter);
+ if (status)
+ return be_cmd_status(status);
+
+ /* Updating real_num_tx/rx_queues() requires rtnl_lock() */
+ rtnl_lock();
+ status = be_update_queues(adapter);
+ rtnl_unlock();
+ if (status)
+ return be_cmd_status(status);
+
+ if (adapter->num_vfs)
+ status = be_vf_setup(adapter);
+
+ if (!status)
+ return adapter->num_vfs;
+
+ return 0;
+}
+
static const struct pci_error_handlers be_eeh_handlers = {
.error_detected = be_eeh_err_detected,
.slot_reset = be_eeh_reset,
@@ -5555,6 +5651,7 @@ static struct pci_driver be_driver = {
.suspend = be_suspend,
.resume = be_pci_resume,
.shutdown = be_shutdown,
+ .sriov_configure = be_pci_sriov_configure,
.err_handler = &be_eeh_handlers
};
@@ -5568,6 +5665,11 @@ static int __init be_init_module(void)
rx_frag_size = 2048;
}
+ if (num_vfs > 0) {
+ pr_info(DRV_NAME " : Module param num_vfs is obsolete.");
+ pr_info(DRV_NAME " : Use sysfs method to enable VFs\n");
+ }
+
return pci_register_driver(&be_driver);
}
module_init(be_init_module);
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 736d5d1624a1..7fb244f565b2 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -52,7 +52,12 @@
NETIF_MSG_RX_ERR| \
NETIF_MSG_TX_ERR)
+#define SH_ETH_OFFSET_DEFAULTS \
+ [0 ... SH_ETH_MAX_REGISTER_OFFSET - 1] = SH_ETH_OFFSET_INVALID
+
static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = {
+ SH_ETH_OFFSET_DEFAULTS,
+
[EDSR] = 0x0000,
[EDMR] = 0x0400,
[EDTRR] = 0x0408,
@@ -132,9 +137,6 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = {
[TSU_POST3] = 0x0078,
[TSU_POST4] = 0x007c,
[TSU_ADRH0] = 0x0100,
- [TSU_ADRL0] = 0x0104,
- [TSU_ADRH31] = 0x01f8,
- [TSU_ADRL31] = 0x01fc,
[TXNLCR0] = 0x0080,
[TXALCR0] = 0x0084,
@@ -151,6 +153,8 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = {
};
static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = {
+ SH_ETH_OFFSET_DEFAULTS,
+
[EDSR] = 0x0000,
[EDMR] = 0x0400,
[EDTRR] = 0x0408,
@@ -199,9 +203,6 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = {
[TSU_ADSBSY] = 0x0060,
[TSU_TEN] = 0x0064,
[TSU_ADRH0] = 0x0100,
- [TSU_ADRL0] = 0x0104,
- [TSU_ADRH31] = 0x01f8,
- [TSU_ADRL31] = 0x01fc,
[TXNLCR0] = 0x0080,
[TXALCR0] = 0x0084,
@@ -210,6 +211,8 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = {
};
static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = {
+ SH_ETH_OFFSET_DEFAULTS,
+
[ECMR] = 0x0300,
[RFLR] = 0x0308,
[ECSR] = 0x0310,
@@ -256,6 +259,8 @@ static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = {
};
static const u16 sh_eth_offset_fast_sh4[SH_ETH_MAX_REGISTER_OFFSET] = {
+ SH_ETH_OFFSET_DEFAULTS,
+
[ECMR] = 0x0100,
[RFLR] = 0x0108,
[ECSR] = 0x0110,
@@ -308,6 +313,8 @@ static const u16 sh_eth_offset_fast_sh4[SH_ETH_MAX_REGISTER_OFFSET] = {
};
static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = {
+ SH_ETH_OFFSET_DEFAULTS,
+
[EDMR] = 0x0000,
[EDTRR] = 0x0004,
[EDRRR] = 0x0008,
@@ -392,8 +399,6 @@ static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = {
[FWALCR1] = 0x00b4,
[TSU_ADRH0] = 0x0100,
- [TSU_ADRL0] = 0x0104,
- [TSU_ADRL31] = 0x01fc,
};
static void sh_eth_rcv_snd_disable(struct net_device *ndev);
@@ -588,6 +593,7 @@ static struct sh_eth_cpu_data sh7757_data = {
.no_ade = 1,
.rpadir = 1,
.rpadir_value = 2 << 16,
+ .rtrate = 1,
};
#define SH_GIGA_ETH_BASE 0xfee00000UL
@@ -1411,6 +1417,9 @@ static int sh_eth_txfree(struct net_device *ndev)
break;
/* TACT bit must be checked before all the following reads */
rmb();
+ netif_info(mdp, tx_done, ndev,
+ "tx entry %d status 0x%08x\n",
+ entry, edmac_to_cpu(mdp, txdesc->status));
/* Free the original skb. */
if (mdp->tx_skbuff[entry]) {
dma_unmap_single(&ndev->dev, txdesc->addr,
@@ -1456,6 +1465,10 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
if (--boguscnt < 0)
break;
+ netif_info(mdp, rx_status, ndev,
+ "rx entry %d status 0x%08x len %d\n",
+ entry, desc_status, pkt_len);
+
if (!(desc_status & RDFEND))
ndev->stats.rx_length_errors++;
@@ -1500,6 +1513,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
netif_receive_skb(skb);
ndev->stats.rx_packets++;
ndev->stats.rx_bytes += pkt_len;
+ if (desc_status & RD_RFS8)
+ ndev->stats.multicast++;
}
entry = (++mdp->cur_rx) % mdp->num_rx_ring;
rxdesc = &mdp->rx_ring[entry];
@@ -1542,7 +1557,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
/* If we don't need to check status, don't. -KDU */
if (!(sh_eth_read(ndev, EDRRR) & EDRRR_R)) {
/* fix the values for the next receiving if RDE is set */
- if (intr_status & EESR_RDE && mdp->reg_offset[RDFAR] != 0) {
+ if (intr_status & EESR_RDE &&
+ mdp->reg_offset[RDFAR] != SH_ETH_OFFSET_INVALID) {
u32 count = (sh_eth_read(ndev, RDFAR) -
sh_eth_read(ndev, RDLAR)) >> 4;
@@ -1929,6 +1945,192 @@ error_exit:
return ret;
}
+/* If it is ever necessary to increase SH_ETH_REG_DUMP_MAX_REGS, the
+ * version must be bumped as well. Just adding registers up to that
+ * limit is fine, as long as the existing register indices don't
+ * change.
+ */
+#define SH_ETH_REG_DUMP_VERSION 1
+#define SH_ETH_REG_DUMP_MAX_REGS 256
+
+static size_t __sh_eth_get_regs(struct net_device *ndev, u32 *buf)
+{
+ struct sh_eth_private *mdp = netdev_priv(ndev);
+ struct sh_eth_cpu_data *cd = mdp->cd;
+ u32 *valid_map;
+ size_t len;
+
+ BUILD_BUG_ON(SH_ETH_MAX_REGISTER_OFFSET > SH_ETH_REG_DUMP_MAX_REGS);
+
+ /* Dump starts with a bitmap that tells ethtool which
+ * registers are defined for this chip.
+ */
+ len = DIV_ROUND_UP(SH_ETH_REG_DUMP_MAX_REGS, 32);
+ if (buf) {
+ valid_map = buf;
+ buf += len;
+ } else {
+ valid_map = NULL;
+ }
+
+ /* Add a register to the dump, if it has a defined offset.
+ * This automatically skips most undefined registers, but for
+ * some it is also necessary to check a capability flag in
+ * struct sh_eth_cpu_data.
+ */
+#define mark_reg_valid(reg) valid_map[reg / 32] |= 1U << (reg % 32)
+#define add_reg_from(reg, read_expr) do { \
+ if (mdp->reg_offset[reg] != SH_ETH_OFFSET_INVALID) { \
+ if (buf) { \
+ mark_reg_valid(reg); \
+ *buf++ = read_expr; \
+ } \
+ ++len; \
+ } \
+ } while (0)
+#define add_reg(reg) add_reg_from(reg, sh_eth_read(ndev, reg))
+#define add_tsu_reg(reg) add_reg_from(reg, sh_eth_tsu_read(mdp, reg))
+
+ add_reg(EDSR);
+ add_reg(EDMR);
+ add_reg(EDTRR);
+ add_reg(EDRRR);
+ add_reg(EESR);
+ add_reg(EESIPR);
+ add_reg(TDLAR);
+ add_reg(TDFAR);
+ add_reg(TDFXR);
+ add_reg(TDFFR);
+ add_reg(RDLAR);
+ add_reg(RDFAR);
+ add_reg(RDFXR);
+ add_reg(RDFFR);
+ add_reg(TRSCER);
+ add_reg(RMFCR);
+ add_reg(TFTR);
+ add_reg(FDR);
+ add_reg(RMCR);
+ add_reg(TFUCR);
+ add_reg(RFOCR);
+ if (cd->rmiimode)
+ add_reg(RMIIMODE);
+ add_reg(FCFTR);
+ if (cd->rpadir)
+ add_reg(RPADIR);
+ if (!cd->no_trimd)
+ add_reg(TRIMD);
+ add_reg(ECMR);
+ add_reg(ECSR);
+ add_reg(ECSIPR);
+ add_reg(PIR);
+ if (!cd->no_psr)
+ add_reg(PSR);
+ add_reg(RDMLR);
+ add_reg(RFLR);
+ add_reg(IPGR);
+ if (cd->apr)
+ add_reg(APR);
+ if (cd->mpr)
+ add_reg(MPR);
+ add_reg(RFCR);
+ add_reg(RFCF);
+ if (cd->tpauser)
+ add_reg(TPAUSER);
+ add_reg(TPAUSECR);
+ add_reg(GECMR);
+ if (cd->bculr)
+ add_reg(BCULR);
+ add_reg(MAHR);
+ add_reg(MALR);
+ add_reg(TROCR);
+ add_reg(CDCR);
+ add_reg(LCCR);
+ add_reg(CNDCR);
+ add_reg(CEFCR);
+ add_reg(FRECR);
+ add_reg(TSFRCR);
+ add_reg(TLFRCR);
+ add_reg(CERCR);
+ add_reg(CEECR);
+ add_reg(MAFCR);
+ if (cd->rtrate)
+ add_reg(RTRATE);
+ if (cd->hw_crc)
+ add_reg(CSMR);
+ if (cd->select_mii)
+ add_reg(RMII_MII);
+ add_reg(ARSTR);
+ if (cd->tsu) {
+ add_tsu_reg(TSU_CTRST);
+ add_tsu_reg(TSU_FWEN0);
+ add_tsu_reg(TSU_FWEN1);
+ add_tsu_reg(TSU_FCM);
+ add_tsu_reg(TSU_BSYSL0);
+ add_tsu_reg(TSU_BSYSL1);
+ add_tsu_reg(TSU_PRISL0);
+ add_tsu_reg(TSU_PRISL1);
+ add_tsu_reg(TSU_FWSL0);
+ add_tsu_reg(TSU_FWSL1);
+ add_tsu_reg(TSU_FWSLC);
+ add_tsu_reg(TSU_QTAG0);
+ add_tsu_reg(TSU_QTAG1);
+ add_tsu_reg(TSU_QTAGM0);
+ add_tsu_reg(TSU_QTAGM1);
+ add_tsu_reg(TSU_FWSR);
+ add_tsu_reg(TSU_FWINMK);
+ add_tsu_reg(TSU_ADQT0);
+ add_tsu_reg(TSU_ADQT1);
+ add_tsu_reg(TSU_VTAG0);
+ add_tsu_reg(TSU_VTAG1);
+ add_tsu_reg(TSU_ADSBSY);
+ add_tsu_reg(TSU_TEN);
+ add_tsu_reg(TSU_POST1);
+ add_tsu_reg(TSU_POST2);
+ add_tsu_reg(TSU_POST3);
+ add_tsu_reg(TSU_POST4);
+ if (mdp->reg_offset[TSU_ADRH0] != SH_ETH_OFFSET_INVALID) {
+ /* This is the start of a table, not just a single
+ * register.
+ */
+ if (buf) {
+ unsigned int i;
+
+ mark_reg_valid(TSU_ADRH0);
+ for (i = 0; i < SH_ETH_TSU_CAM_ENTRIES * 2; i++)
+ *buf++ = ioread32(
+ mdp->tsu_addr +
+ mdp->reg_offset[TSU_ADRH0] +
+ i * 4);
+ }
+ len += SH_ETH_TSU_CAM_ENTRIES * 2;
+ }
+ }
+
+#undef mark_reg_valid
+#undef add_reg_from
+#undef add_reg
+#undef add_tsu_reg
+
+ return len * 4;
+}
+
+static int sh_eth_get_regs_len(struct net_device *ndev)
+{
+ return __sh_eth_get_regs(ndev, NULL);
+}
+
+static void sh_eth_get_regs(struct net_device *ndev, struct ethtool_regs *regs,
+ void *buf)
+{
+ struct sh_eth_private *mdp = netdev_priv(ndev);
+
+ regs->version = SH_ETH_REG_DUMP_VERSION;
+
+ pm_runtime_get_sync(&mdp->pdev->dev);
+ __sh_eth_get_regs(ndev, buf);
+ pm_runtime_put_sync(&mdp->pdev->dev);
+}
+
static int sh_eth_nway_reset(struct net_device *ndev)
{
struct sh_eth_private *mdp = netdev_priv(ndev);
@@ -2074,6 +2276,8 @@ static int sh_eth_set_ringparam(struct net_device *ndev,
static const struct ethtool_ops sh_eth_ethtool_ops = {
.get_settings = sh_eth_get_settings,
.set_settings = sh_eth_set_settings,
+ .get_regs_len = sh_eth_get_regs_len,
+ .get_regs = sh_eth_get_regs,
.nway_reset = sh_eth_nway_reset,
.get_msglevel = sh_eth_get_msglevel,
.set_msglevel = sh_eth_set_msglevel,
@@ -2213,6 +2417,22 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
return NETDEV_TX_OK;
}
+/* The statistics registers have write-clear behaviour, which means we
+ * will lose any increment between the read and write. We mitigate
+ * this by only clearing when we read a non-zero value, so we will
+ * never falsely report a total of zero.
+ */
+static void
+sh_eth_update_stat(struct net_device *ndev, unsigned long *stat, int reg)
+{
+ u32 delta = sh_eth_read(ndev, reg);
+
+ if (delta) {
+ *stat += delta;
+ sh_eth_write(ndev, 0, reg);
+ }
+}
+
static struct net_device_stats *sh_eth_get_stats(struct net_device *ndev)
{
struct sh_eth_private *mdp = netdev_priv(ndev);
@@ -2223,21 +2443,18 @@ static struct net_device_stats *sh_eth_get_stats(struct net_device *ndev)
if (!mdp->is_opened)
return &ndev->stats;
- ndev->stats.tx_dropped += sh_eth_read(ndev, TROCR);
- sh_eth_write(ndev, 0, TROCR); /* (write clear) */
- ndev->stats.collisions += sh_eth_read(ndev, CDCR);
- sh_eth_write(ndev, 0, CDCR); /* (write clear) */
- ndev->stats.tx_carrier_errors += sh_eth_read(ndev, LCCR);
- sh_eth_write(ndev, 0, LCCR); /* (write clear) */
+ sh_eth_update_stat(ndev, &ndev->stats.tx_dropped, TROCR);
+ sh_eth_update_stat(ndev, &ndev->stats.collisions, CDCR);
+ sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors, LCCR);
if (sh_eth_is_gether(mdp)) {
- ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CERCR);
- sh_eth_write(ndev, 0, CERCR); /* (write clear) */
- ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CEECR);
- sh_eth_write(ndev, 0, CEECR); /* (write clear) */
+ sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors,
+ CERCR);
+ sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors,
+ CEECR);
} else {
- ndev->stats.tx_carrier_errors += sh_eth_read(ndev, CNDCR);
- sh_eth_write(ndev, 0, CNDCR); /* (write clear) */
+ sh_eth_update_stat(ndev, &ndev->stats.tx_carrier_errors,
+ CNDCR);
}
return &ndev->stats;
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 259d03f353e1..06dbbe5201cb 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -32,6 +32,10 @@
#define SH_ETH_TSU_CAM_ENTRIES 32
enum {
+ /* IMPORTANT: To keep ethtool register dump working, add new
+ * register names immediately before SH_ETH_MAX_REGISTER_OFFSET.
+ */
+
/* E-DMAC registers */
EDSR = 0,
EDMR,
@@ -131,9 +135,7 @@ enum {
TSU_POST3,
TSU_POST4,
TSU_ADRH0,
- TSU_ADRL0,
- TSU_ADRH31,
- TSU_ADRL31,
+ /* TSU_ADR{H,L}{0..31} are assumed to be contiguous */
TXNLCR0,
TXALCR0,
@@ -491,6 +493,7 @@ struct sh_eth_cpu_data {
unsigned select_mii:1; /* EtherC have RMII_MII (MII select register) */
unsigned shift_rd0:1; /* shift Rx descriptor word 0 right by 16 */
unsigned rmiimode:1; /* EtherC has RMIIMODE register */
+ unsigned rtrate:1; /* EtherC has RTRATE register */
};
struct sh_eth_private {
@@ -543,19 +546,29 @@ static inline void sh_eth_soft_swap(char *src, int len)
#endif
}
+#define SH_ETH_OFFSET_INVALID ((u16) ~0)
+
static inline void sh_eth_write(struct net_device *ndev, u32 data,
int enum_index)
{
struct sh_eth_private *mdp = netdev_priv(ndev);
+ u16 offset = mdp->reg_offset[enum_index];
- iowrite32(data, mdp->addr + mdp->reg_offset[enum_index]);
+ if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+ return;
+
+ iowrite32(data, mdp->addr + offset);
}
static inline u32 sh_eth_read(struct net_device *ndev, int enum_index)
{
struct sh_eth_private *mdp = netdev_priv(ndev);
+ u16 offset = mdp->reg_offset[enum_index];
+
+ if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+ return ~0U;
- return ioread32(mdp->addr + mdp->reg_offset[enum_index]);
+ return ioread32(mdp->addr + offset);
}
static inline void *sh_eth_tsu_get_offset(struct sh_eth_private *mdp,
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5c19cba34dce..fab4d0ddf4ed 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -181,6 +181,7 @@ struct ucred {
#define AF_WANPIPE 25 /* Wanpipe API Sockets */
#define AF_LLC 26 /* Linux LLC */
#define AF_IB 27 /* Native InfiniBand address */
+#define AF_MPLS 28 /* MPLS */
#define AF_CAN 29 /* Controller Area Network */
#define AF_TIPC 30 /* TIPC sockets */
#define AF_BLUETOOTH 31 /* Bluetooth sockets */
@@ -226,6 +227,7 @@ struct ucred {
#define PF_WANPIPE AF_WANPIPE
#define PF_LLC AF_LLC
#define PF_IB AF_IB
+#define PF_MPLS AF_MPLS
#define PF_CAN AF_CAN
#define PF_TIPC AF_TIPC
#define PF_BLUETOOTH AF_BLUETOOTH
diff --git a/include/net/arp.h b/include/net/arp.h
index 21ee1860abbc..5e0f891d476c 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -9,28 +9,17 @@
extern struct neigh_table arp_tbl;
-static inline u32 arp_hashfn(u32 key, const struct net_device *dev, u32 hash_rnd)
+static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd)
{
+ u32 key = *(const u32 *)pkey;
u32 val = key ^ hash32_ptr(dev);
- return val * hash_rnd;
+ return val * hash_rnd[0];
}
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
- struct neigh_hash_table *nht = rcu_dereference_bh(arp_tbl.nht);
- struct neighbour *n;
- u32 hash_val;
-
- hash_val = arp_hashfn(key, dev, nht->hash_rnd[0]) >> (32 - nht->hash_shift);
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- if (n->dev == dev && *(u32 *)n->primary_key == key)
- return n;
- }
-
- return NULL;
+ return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}
static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 6bbda34d5e59..b3a7751251b4 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -156,24 +156,7 @@ static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, _
static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey)
{
- struct neigh_hash_table *nht;
- const u32 *p32 = pkey;
- struct neighbour *n;
- u32 hash_val;
-
- nht = rcu_dereference_bh(nd_tbl.nht);
- hash_val = ndisc_hashfn(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- u32 *n32 = (u32 *) n->primary_key;
- if (n->dev == dev &&
- ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) |
- (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0)
- return n;
- }
-
- return NULL;
+ return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev);
}
static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9f912e4d4232..afb8237b0a8c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -197,6 +197,7 @@ struct neigh_table {
__u32 (*hash)(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
+ bool (*key_eq)(const struct neighbour *, const void *pkey);
int (*constructor)(struct neighbour *);
int (*pconstructor)(struct pneigh_entry *);
void (*pdestructor)(struct pneigh_entry *);
@@ -247,6 +248,57 @@ static inline void *neighbour_priv(const struct neighbour *n)
#define NEIGH_UPDATE_F_ISROUTER 0x40000000
#define NEIGH_UPDATE_F_ADMIN 0x80000000
+
+static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey)
+{
+ return *(const u16 *)n->primary_key == *(const u16 *)pkey;
+}
+
+static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
+{
+ return *(const u32 *)n->primary_key == *(const u32 *)pkey;
+}
+
+static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey)
+{
+ const u32 *n32 = (const u32 *)n->primary_key;
+ const u32 *p32 = pkey;
+
+ return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) |
+ (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0;
+}
+
+static inline struct neighbour *___neigh_lookup_noref(
+ struct neigh_table *tbl,
+ bool (*key_eq)(const struct neighbour *n, const void *pkey),
+ __u32 (*hash)(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd),
+ const void *pkey,
+ struct net_device *dev)
+{
+ struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
+ struct neighbour *n;
+ u32 hash_val;
+
+ hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (n->dev == dev && key_eq(n, pkey))
+ return n;
+ }
+
+ return NULL;
+}
+
+static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl,
+ const void *pkey,
+ struct net_device *dev)
+{
+ return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev);
+}
+
void neigh_table_init(int index, struct neigh_table *tbl);
int neigh_table_clear(int index, struct neigh_table *tbl);
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -306,6 +358,7 @@ void neigh_for_each(struct neigh_table *tbl,
void (*cb)(struct neighbour *, void *), void *cookie);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *));
+int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *);
void pneigh_for_each(struct neigh_table *tbl,
void (*cb)(struct pneigh_entry *));
@@ -459,4 +512,6 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
memcpy(dst, n->ha, dev->addr_len);
} while (read_seqretry(&n->ha_lock, seq));
}
+
+
#endif
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 36faf4990c4b..2cb9acb618e9 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -26,6 +26,7 @@
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
+#include <net/netns/mpls.h>
#include <linux/ns_common.h>
struct user_namespace;
@@ -130,6 +131,9 @@ struct net {
#if IS_ENABLED(CONFIG_IP_VS)
struct netns_ipvs *ipvs;
#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ struct netns_mpls mpls;
+#endif
struct sock *diag_nlsk;
atomic_t fnhe_genid;
};
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
new file mode 100644
index 000000000000..d29203651c01
--- /dev/null
+++ b/include/net/netns/mpls.h
@@ -0,0 +1,17 @@
+/*
+ * mpls in net namespaces
+ */
+
+#ifndef __NETNS_MPLS_H__
+#define __NETNS_MPLS_H__
+
+struct mpls_route;
+struct ctl_table_header;
+
+struct netns_mpls {
+ size_t platform_labels;
+ struct mpls_route __rcu * __rcu *platform_label;
+ struct ctl_table_header *ctl;
+};
+
+#endif /* __NETNS_MPLS_H__ */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5cc5d66bf519..06f75a407f74 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -303,6 +303,8 @@ enum rtattr_type_t {
RTA_TABLE,
RTA_MARK,
RTA_MFC_STATS,
+ RTA_VIA,
+ RTA_NEWDST,
__RTA_MAX
};
@@ -344,6 +346,12 @@ struct rtnexthop {
#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len))
#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
+/* RTA_VIA */
+struct rtvia {
+ __kernel_sa_family_t rtvia_family;
+ __u8 rtvia_addr[0];
+};
+
/* RTM_CACHEINFO */
struct rta_cacheinfo {
@@ -623,6 +631,8 @@ enum rtnetlink_groups {
#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
RTNLGRP_MDB,
#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
__RTNLGRP_MAX
};
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
diff --git a/net/Makefile b/net/Makefile
index 38704bdf941a..3995613e5510 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/
obj-$(CONFIG_NFC) += nfc/
obj-$(CONFIG_OPENVSWITCH) += openvswitch/
obj-$(CONFIG_VSOCKETS) += vmw_vsock/
-obj-$(CONFIG_NET_MPLS_GSO) += mpls/
+obj-$(CONFIG_MPLS) += mpls/
obj-$(CONFIG_HSR) += hsr/
ifneq ($(CONFIG_NET_SWITCHDEV),)
obj-y += switchdev/
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0f48ea3affed..cffaf00561e7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -397,25 +397,15 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
- int key_len = tbl->key_len;
- u32 hash_val;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
-
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- if (!atomic_inc_not_zero(&n->refcnt))
- n = NULL;
- NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
- }
+ n = __neigh_lookup_noref(tbl, pkey, dev);
+ if (n) {
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
}
rcu_read_unlock_bh();
@@ -2401,6 +2391,40 @@ void __neigh_for_each_release(struct neigh_table *tbl,
}
EXPORT_SYMBOL(__neigh_for_each_release);
+int neigh_xmit(int family, struct net_device *dev,
+ const void *addr, struct sk_buff *skb)
+{
+ int err;
+ if (family == AF_PACKET) {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ addr, NULL, skb->len);
+ if (err < 0)
+ goto out_kfree_skb;
+ err = dev_queue_xmit(skb);
+ } else {
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+
+ err = -ENETDOWN;
+ tbl = neigh_find_table(family);
+ if (!tbl)
+ goto out;
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ if (!neigh)
+ neigh = __neigh_create(tbl, addr, dev, false);
+ err = PTR_ERR(neigh);
+ if (IS_ERR(neigh))
+ goto out_kfree_skb;
+ err = neigh->output(neigh, skb);
+ }
+out:
+ return err;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_xmit);
+
#ifdef CONFIG_PROC_FS
static struct neighbour *neigh_get_first(struct seq_file *seq)
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f123c6c6748c..ee7d1cef0027 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -93,12 +93,18 @@ static u32 dn_neigh_hash(const void *pkey,
return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]);
}
+static bool dn_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq16(neigh, pkey);
+}
+
struct neigh_table dn_neigh_table = {
.family = PF_DECnet,
.entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)),
.key_len = sizeof(__le16),
.protocol = cpu_to_be16(ETH_P_DNA_RT),
.hash = dn_neigh_hash,
+ .key_eq = dn_key_eq,
.constructor = dn_neigh_construct,
.id = "dn_neigh_cache",
.parms ={
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6b8aad6a0d7d..5f5c674e130a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -122,6 +122,7 @@
* Interface to generic neighbour cache.
*/
static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -154,6 +155,7 @@ struct neigh_table arp_tbl = {
.key_len = 4,
.protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
+ .key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
@@ -209,7 +211,12 @@ static u32 arp_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd)
{
- return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq32(neigh, pkey);
}
static int arp_constructor(struct neighbour *neigh)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e363bbc2420d..247ad7c298f7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -84,6 +84,7 @@ do { \
static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
+static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -119,6 +120,7 @@ struct neigh_table nd_tbl = {
.key_len = sizeof(struct in6_addr),
.protocol = cpu_to_be16(ETH_P_IPV6),
.hash = ndisc_hash,
+ .key_eq = ndisc_key_eq,
.constructor = ndisc_constructor,
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
@@ -295,6 +297,11 @@ static u32 ndisc_hash(const void *pkey,
return ndisc_hashfn(pkey, dev, hash_rnd);
}
+static bool ndisc_key_eq(const struct neighbour *n, const void *pkey)
+{
+ return neigh_key_eq128(n, pkey);
+}
+
static int ndisc_constructor(struct neighbour *neigh)
{
struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key;
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 37421db88965..f4286ee7e2b0 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -1,9 +1,30 @@
#
# MPLS configuration
#
+
+menuconfig MPLS
+ tristate "MultiProtocol Label Switching"
+ default n
+ ---help---
+ MultiProtocol Label Switching routes packets through logical
+ circuits. Originally conceived as a way of routing packets at
+ hardware speeds (before hardware was capable of routing ipv4 packets),
+ MPLS remains a simple way of making tunnels.
+
+ If you have not heard of MPLS you probably want to say N here.
+
+if MPLS
+
config NET_MPLS_GSO
- tristate "MPLS: GSO support"
+ bool "MPLS: GSO support"
help
This is helper module to allow segmentation of non-MPLS GSO packets
that have had MPLS stack entries pushed onto them and thus
become MPLS GSO packets.
+
+config MPLS_ROUTING
+ bool "MPLS: routing support"
+ help
+ Add support for forwarding of mpls packets.
+
+endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 6dec088c2d0f..60af15f1960e 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -2,3 +2,4 @@
# Makefile for MPLS.
#
obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
+obj-$(CONFIG_MPLS_ROUTING) += af_mpls.o
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
new file mode 100644
index 000000000000..23e51d13b0ff
--- /dev/null
+++ b/net/mpls/af_mpls.c
@@ -0,0 +1,974 @@
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/mpls.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include "internal.h"
+
+#define LABEL_NOT_SPECIFIED (1<<20)
+#define MAX_NEW_LABELS 2
+
+/* This maximum ha length copied from the definition of struct neighbour */
+#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
+
+struct mpls_route { /* next hop label forwarding entry */
+ struct net_device *rt_dev;
+ struct rcu_head rt_rcu;
+ u32 rt_label[MAX_NEW_LABELS];
+ u8 rt_protocol; /* routing protocol that set this entry */
+ u8 rt_labels:2,
+ rt_via_alen:6;
+ unsigned short rt_via_family;
+ u8 rt_via[0];
+};
+
+static int zero = 0;
+static int label_limit = (1 << 20) - 1;
+
+static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
+ struct nlmsghdr *nlh, struct net *net, u32 portid,
+ unsigned int nlm_flags);
+
+static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
+{
+ struct mpls_route *rt = NULL;
+
+ if (index < net->mpls.platform_labels) {
+ struct mpls_route __rcu **platform_label =
+ rcu_dereference(net->mpls.platform_label);
+ rt = rcu_dereference(platform_label[index]);
+ }
+ return rt;
+}
+
+static bool mpls_output_possible(const struct net_device *dev)
+{
+ return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
+}
+
+static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+{
+ /* The size of the layer 2.5 labels to be added for this route */
+ return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+}
+
+static unsigned int mpls_dev_mtu(const struct net_device *dev)
+{
+ /* The amount of data the layer 2 frame can hold */
+ return dev->mtu;
+}
+
+static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
+ struct mpls_entry_decoded dec)
+{
+ /* RFC4385 and RFC5586 encode other packets in mpls such that
+ * they don't conflict with the ip version number, making
+ * decoding by examining the ip version correct in everything
+ * except for the strangest cases.
+ *
+ * The strange cases if we choose to support them will require
+ * manual configuration.
+ */
+ struct iphdr *hdr4 = ip_hdr(skb);
+ bool success = true;
+
+ if (hdr4->version == 4) {
+ skb->protocol = htons(ETH_P_IP);
+ csum_replace2(&hdr4->check,
+ htons(hdr4->ttl << 8),
+ htons(dec.ttl << 8));
+ hdr4->ttl = dec.ttl;
+ }
+ else if (hdr4->version == 6) {
+ struct ipv6hdr *hdr6 = ipv6_hdr(skb);
+ skb->protocol = htons(ETH_P_IPV6);
+ hdr6->hop_limit = dec.ttl;
+ }
+ else
+ /* version 0 and version 1 are used by pseudo wires */
+ success = false;
+ return success;
+}
+
+static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+ struct mpls_shim_hdr *hdr;
+ struct mpls_route *rt;
+ struct mpls_entry_decoded dec;
+ struct net_device *out_dev;
+ unsigned int hh_len;
+ unsigned int new_header_size;
+ unsigned int mtu;
+ int err;
+
+ /* Careful this entire function runs inside of an rcu critical section */
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr)))
+ goto drop;
+
+ /* Read and decode the label */
+ hdr = mpls_hdr(skb);
+ dec = mpls_entry_decode(hdr);
+
+ /* Pop the label */
+ skb_pull(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+
+ skb_orphan(skb);
+
+ rt = mpls_route_input_rcu(net, dec.label);
+ if (!rt)
+ goto drop;
+
+ /* Find the output device */
+ out_dev = rt->rt_dev;
+ if (!mpls_output_possible(out_dev))
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ /* ttl <= 1 would go out as 0, which must not be forwarded (RFC3032) */
+ if (dec.ttl <= 1)
+ goto drop;
+ dec.ttl -= 1;
+
+ /* Verify the destination can hold the packet */
+ new_header_size = mpls_rt_header_size(rt);
+ mtu = mpls_dev_mtu(out_dev);
+ if (mpls_pkt_too_big(skb, mtu - new_header_size))
+ goto drop;
+
+ hh_len = LL_RESERVED_SPACE(out_dev);
+ if (!out_dev->header_ops)
+ hh_len = 0;
+
+ /* Ensure there is enough space for the headers in the skb */
+ if (skb_cow(skb, hh_len + new_header_size))
+ goto drop;
+
+ skb->dev = out_dev;
+ skb->protocol = htons(ETH_P_MPLS_UC);
+
+ if (unlikely(!new_header_size && dec.bos)) {
+ /* Penultimate hop popping */
+ if (!mpls_egress(rt, skb, dec))
+ goto drop;
+ } else {
+ bool bos;
+ int i;
+ skb_push(skb, new_header_size);
+ skb_reset_network_header(skb);
+ /* Push the new labels */
+ hdr = mpls_hdr(skb);
+ bos = dec.bos;
+ for (i = rt->rt_labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
+ bos = false;
+ }
+ }
+
+ err = neigh_xmit(rt->rt_via_family, out_dev, rt->rt_via, skb);
+ if (err)
+ net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+ __func__, err);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct packet_type mpls_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MPLS_UC),
+ .func = mpls_forward,
+};
+
+static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+};
+
+struct mpls_route_config {
+ u32 rc_protocol;
+ u32 rc_ifindex;
+ u16 rc_via_family;
+ u16 rc_via_alen;
+ u8 rc_via[MAX_VIA_ALEN];
+ u32 rc_label;
+ u32 rc_output_labels;
+ u32 rc_output_label[MAX_NEW_LABELS];
+ u32 rc_nlflags;
+ struct nl_info rc_nlinfo;
+};
+
+static struct mpls_route *mpls_rt_alloc(size_t alen)
+{
+ struct mpls_route *rt;
+ /* Zeroed route plus alen trailing bytes for rt_via; NULL on failure */
+ rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
+ if (rt)
+ rt->rt_via_alen = alen;
+ return rt;
+}
+
+static void mpls_rt_free(struct mpls_route *rt)
+{
+ if (rt)
+ kfree_rcu(rt, rt_rcu);
+}
+
+static void mpls_notify_route(struct net *net, unsigned index,
+ struct mpls_route *old, struct mpls_route *new,
+ const struct nl_info *info)
+{
+ struct nlmsghdr *nlh = info ? info->nlh : NULL;
+ unsigned portid = info ? info->portid : 0;
+ int event = new ? RTM_NEWROUTE : RTM_DELROUTE;
+ struct mpls_route *rt = new ? new : old;
+ unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0;
+ /* Ignore reserved labels for now */
+ if (rt && (index >= 16))
+ rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags);
+}
+
+static void mpls_route_update(struct net *net, unsigned index,
+ struct net_device *dev, struct mpls_route *new,
+ const struct nl_info *info)
+{
+ struct mpls_route *rt, *old = NULL;
+
+ ASSERT_RTNL();
+
+ rt = net->mpls.platform_label[index];
+ if (!dev || (rt && (rt->rt_dev == dev))) {
+ rcu_assign_pointer(net->mpls.platform_label[index], new);
+ old = rt;
+ }
+
+ mpls_notify_route(net, index, old, new, info);
+
+ /* If we removed a route free it now */
+ mpls_rt_free(old);
+}
+
+static unsigned find_free_label(struct net *net)
+{
+ unsigned index;
+ for (index = 16; index < net->mpls.platform_labels; index++) {
+ if (!net->mpls.platform_label[index])
+ return index;
+ }
+ return LABEL_NOT_SPECIFIED;
+}
+
+/* Install the route described by @cfg into the platform label table.
+ *
+ * When no label was given and NLM_F_CREATE is set, the first free label
+ * is picked.  The request is then validated, a new mpls_route is built
+ * from @cfg and handed to mpls_route_update().
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL  label below 16 or outside the table, too many output
+ *            labels, unsupported device type, AF_PACKET via length not
+ *            matching the device address length, or NLM_F_APPEND set
+ *   -ENODEV  rc_ifindex does not name a device in this namespace
+ *   -EEXIST  a route is present and NLM_F_EXCL is set or NLM_F_REPLACE
+ *            is not set
+ *   -ENOENT  no route is present and NLM_F_CREATE is not set
+ *   -ENOMEM  the route could not be allocated
+ */
+static int mpls_route_add(struct mpls_route_config *cfg)
+{
+	struct net *net = cfg->rc_nlinfo.nl_net;
+	struct net_device *out_dev;
+	struct mpls_route *route, *existing;
+	unsigned label = cfg->rc_label;
+	int n;
+	int err;
+
+	/* If a label was not specified during insert pick one */
+	if ((label == LABEL_NOT_SPECIFIED) &&
+	    (cfg->rc_nlflags & NLM_F_CREATE))
+		label = find_free_label(net);
+
+	/* The first 16 labels are reserved, and may not be set */
+	if (label < 16)
+		return -EINVAL;
+
+	/* The full 20 bit range may not be supported. */
+	if (label >= net->mpls.platform_labels)
+		return -EINVAL;
+
+	/* Ensure only a supported number of labels are present */
+	if (cfg->rc_output_labels > MAX_NEW_LABELS)
+		return -EINVAL;
+
+	out_dev = dev_get_by_index(net, cfg->rc_ifindex);
+	if (!out_dev)
+		return -ENODEV;
+
+	/* From here on every exit must drop the device reference. */
+	err = -EINVAL;
+
+	/* For now just support ethernet devices */
+	if ((out_dev->type != ARPHRD_ETHER) &&
+	    (out_dev->type != ARPHRD_LOOPBACK))
+		goto put_dev;
+
+	if ((cfg->rc_via_family == AF_PACKET) &&
+	    (out_dev->addr_len != cfg->rc_via_alen))
+		goto put_dev;
+
+	/* Append makes no sense with mpls */
+	if (cfg->rc_nlflags & NLM_F_APPEND)
+		goto put_dev;
+
+	existing = net->mpls.platform_label[label];
+
+	err = -EEXIST;
+	if (existing && ((cfg->rc_nlflags & NLM_F_EXCL) ||
+			 !(cfg->rc_nlflags & NLM_F_REPLACE)))
+		goto put_dev;
+
+	err = -ENOENT;
+	if (!existing && !(cfg->rc_nlflags & NLM_F_CREATE))
+		goto put_dev;
+
+	err = -ENOMEM;
+	route = mpls_rt_alloc(cfg->rc_via_alen);
+	if (!route)
+		goto put_dev;
+
+	route->rt_labels = cfg->rc_output_labels;
+	for (n = 0; n < route->rt_labels; n++)
+		route->rt_label[n] = cfg->rc_output_label[n];
+	route->rt_protocol = cfg->rc_protocol;
+	route->rt_dev = out_dev;
+	route->rt_via_family = cfg->rc_via_family;
+	memcpy(route->rt_via, cfg->rc_via, cfg->rc_via_alen);
+
+	mpls_route_update(net, label, NULL, route, &cfg->rc_nlinfo);
+	err = 0;
+
+put_dev:
+	dev_put(out_dev);
+	return err;
+}
+
+/* Remove the route for the label named in @cfg.
+ *
+ * Returns 0 after asking mpls_route_update() to clear the slot, or
+ * -EINVAL when the label is below 16 or outside the platform label table.
+ */
+static int mpls_route_del(struct mpls_route_config *cfg)
+{
+	struct net *net = cfg->rc_nlinfo.nl_net;
+	unsigned label = cfg->rc_label;
+
+	/* The first 16 labels are reserved, and may not be removed */
+	if (label < 16)
+		return -EINVAL;
+
+	/* The full 20 bit range may not be supported */
+	if (label >= net->mpls.platform_labels)
+		return -EINVAL;
+
+	mpls_route_update(net, label, NULL, NULL, &cfg->rc_nlinfo);
+	return 0;
+}
+
+/* Detach @dev from every route in its namespace's platform label table:
+ * each route whose rt_dev is @dev gets rt_dev cleared.  The routes
+ * themselves stay in the table.
+ */
+static void mpls_ifdown(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	unsigned slot;
+
+	for (slot = 0; slot < net->mpls.platform_labels; slot++) {
+		struct mpls_route *route = net->mpls.platform_label[slot];
+
+		if (route && route->rt_dev == dev)
+			route->rt_dev = NULL;
+	}
+}
+
+/* Netdevice notifier callback.  Only NETDEV_UNREGISTER is acted upon, by
+ * detaching the device from all routes; every event returns NOTIFY_OK.
+ */
+static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_UNREGISTER)
+		mpls_ifdown(dev);
+
+	return NOTIFY_OK;
+}
+
+/* Notifier block that routes netdevice events to mpls_dev_notify();
+ * registered in mpls_init() and unregistered in mpls_exit().
+ */
+static struct notifier_block mpls_dev_notifier = {
+ .notifier_call = mpls_dev_notify,
+};
+
+/* Append an RTA_VIA attribute to @skb: a struct rtvia carrying @family
+ * followed by @alen bytes copied from @addr (payload size is @alen + 2).
+ *
+ * Returns 0, or -EMSGSIZE when the attribute cannot be reserved.
+ */
+static int nla_put_via(struct sk_buff *skb,
+		       u16 family, const void *addr, int alen)
+{
+	struct nlattr *attr = nla_reserve(skb, RTA_VIA, alen + 2);
+	struct rtvia *payload;
+
+	if (!attr)
+		return -EMSGSIZE;
+
+	payload = nla_data(attr);
+	payload->rtvia_family = family;
+	memcpy(payload->rtvia_addr, addr, alen);
+
+	return 0;
+}
+
+/* Append attribute @attrtype to @skb holding @labels label stack entries
+ * of 4 bytes each, built from @label[].  Every entry is encoded with ttl
+ * and tc of 0; only the last entry has the bottom of stack flag set.
+ *
+ * Returns 0, or -EMSGSIZE when the attribute cannot be reserved.
+ */
+int nla_put_labels(struct sk_buff *skb, int attrtype,
+		   u8 labels, const u32 label[])
+{
+	struct nlattr *attr = nla_reserve(skb, attrtype, labels*4);
+	struct mpls_shim_hdr *entries;
+	int last = labels - 1;
+	int pos;
+
+	if (!attr)
+		return -EMSGSIZE;
+
+	entries = nla_data(attr);
+
+	/* Fill from the bottom of the stack upwards. */
+	for (pos = last; pos >= 0; pos--)
+		entries[pos] = mpls_entry_encode(label[pos], 0, 0, pos == last);
+
+	return 0;
+}
+
+/* Decode a label stack attribute into @label[] and store the entry
+ * count in *@labels.
+ *
+ * The payload must be a multiple of 4 bytes and hold at most
+ * @max_labels entries.  Only the last entry may (and must) carry the
+ * bottom of stack flag, and every entry must have ttl and tc clear.
+ *
+ * Returns 0 on success or -EINVAL.  On failure *@labels is untouched,
+ * but entries of @label[] decoded before the bad one may already have
+ * been written.
+ */
+int nla_get_labels(const struct nlattr *nla,
+		   u32 max_labels, u32 *labels, u32 label[])
+{
+	unsigned len = nla_len(nla);
+	unsigned count;
+	struct mpls_shim_hdr *entries;
+	int last;
+	int pos;
+
+	/* len needs to be an even multiple of 4 (the label size) */
+	if (len & 3)
+		return -EINVAL;
+
+	/* Limit the number of new labels allowed */
+	count = len/4;
+	if (count > max_labels)
+		return -EINVAL;
+
+	entries = nla_data(nla);
+	last = count - 1;
+
+	/* Walk from the bottom of the stack towards the top. */
+	for (pos = last; pos >= 0; pos--) {
+		struct mpls_entry_decoded dec = mpls_entry_decode(entries + pos);
+		bool want_bos = (pos == last);
+
+		/* Ensure the bottom of stack flag is properly set
+		 * and ttl and tc are both clear.
+		 */
+		if ((dec.bos != want_bos) || dec.ttl || dec.tc)
+			return -EINVAL;
+
+		label[pos] = dec.label;
+	}
+
+	*labels = count;
+	return 0;
+}
+
+/* Translate a route netlink request into a mpls_route_config.
+ *
+ * @skb: the request skb (source of the sender portid and socket netns)
+ * @nlh: the netlink message header of the request
+ * @cfg: output; zeroed and then filled from the rtmsg and its attributes
+ *
+ * The rtmsg header must describe a main-table, universe-scope, unicast
+ * AF_MPLS route with a 20 bit destination and no source, tos or flags.
+ * Accepted attributes are RTA_OIF, RTA_NEWDST, RTA_DST and RTA_VIA; any
+ * other attribute present in the message is rejected.
+ *
+ * Returns 0 on success, the nlmsg_parse() error if parsing fails, and
+ * -EINVAL for every other rejected request.
+ */
+static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct mpls_route_config *cfg)
+{
+	struct rtmsg *rtm;
+	struct nlattr *tb[RTA_MAX+1];
+	int index;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	rtm = nlmsg_data(nlh);
+	memset(cfg, 0, sizeof(*cfg));
+
+	if (rtm->rtm_family != AF_MPLS)
+		goto errout;
+	if (rtm->rtm_dst_len != 20)
+		goto errout;
+	if (rtm->rtm_src_len != 0)
+		goto errout;
+	if (rtm->rtm_tos != 0)
+		goto errout;
+	if (rtm->rtm_table != RT_TABLE_MAIN)
+		goto errout;
+	/* Any value is acceptable for rtm_protocol */
+
+	/* As mpls uses destination specific addresses
+	 * (or source specific address in the case of multicast)
+	 * all addresses have universal scope.
+	 */
+	if (rtm->rtm_scope != RT_SCOPE_UNIVERSE)
+		goto errout;
+	if (rtm->rtm_type != RTN_UNICAST)
+		goto errout;
+	if (rtm->rtm_flags != 0)
+		goto errout;
+
+	cfg->rc_label = LABEL_NOT_SPECIFIED;
+	cfg->rc_protocol = rtm->rtm_protocol;
+	cfg->rc_nlflags = nlh->nlmsg_flags;
+	cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->rc_nlinfo.nlh = nlh;
+	cfg->rc_nlinfo.nl_net = sock_net(skb->sk);
+
+	for (index = 0; index <= RTA_MAX; index++) {
+		struct nlattr *nla = tb[index];
+		if (!nla)
+			continue;
+
+		switch(index) {
+		case RTA_OIF:
+			cfg->rc_ifindex = nla_get_u32(nla);
+			break;
+		case RTA_NEWDST:
+			if (nla_get_labels(nla, MAX_NEW_LABELS,
+					   &cfg->rc_output_labels,
+					   cfg->rc_output_label))
+				goto errout;
+			break;
+		case RTA_DST:
+		{
+			u32 label_count;
+			if (nla_get_labels(nla, 1, &label_count,
+					   &cfg->rc_label))
+				goto errout;
+
+			/* The first 16 labels are reserved, and may not be set */
+			if (cfg->rc_label < 16)
+				goto errout;
+
+			break;
+		}
+		case RTA_VIA:
+		{
+			struct rtvia *via = nla_data(nla);
+
+			/* The payload must hold at least the 2 byte
+			 * rtvia_family.  Without this check a short
+			 * attribute would be read past its end and the
+			 * address length below would wrap.
+			 */
+			if (nla_len(nla) < 2)
+				goto errout;
+
+			cfg->rc_via_family = via->rtvia_family;
+			cfg->rc_via_alen = nla_len(nla) - 2;
+			if (cfg->rc_via_alen > MAX_VIA_ALEN)
+				goto errout;
+
+			/* Validate the address family */
+			switch(cfg->rc_via_family) {
+			case AF_PACKET:
+				break;
+			case AF_INET:
+				if (cfg->rc_via_alen != 4)
+					goto errout;
+				break;
+			case AF_INET6:
+				if (cfg->rc_via_alen != 16)
+					goto errout;
+				break;
+			default:
+				/* Unsupported address family */
+				goto errout;
+			}
+
+			memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
+			break;
+		}
+		default:
+			/* Unsupported attribute */
+			goto errout;
+		}
+	}
+
+	err = 0;
+errout:
+	return err;
+}
+
+/* RTM_DELROUTE handler: parse the request into a route config and, if
+ * parsing succeeded, delete the route.  Returns the parse error or the
+ * result of mpls_route_del().
+ */
+static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct mpls_route_config cfg;
+	int err = rtm_to_route_config(skb, nlh, &cfg);
+
+	if (err >= 0)
+		err = mpls_route_del(&cfg);
+
+	return err;
+}
+
+
+/* RTM_NEWROUTE handler: parse the request into a route config and, if
+ * parsing succeeded, add the route.  Returns the parse error or the
+ * result of mpls_route_add().
+ */
+static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct mpls_route_config cfg;
+	int err = rtm_to_route_config(skb, nlh, &cfg);
+
+	if (err >= 0)
+		err = mpls_route_add(&cfg);
+
+	return err;
+}
+
+/* Append one route message for (@label, @rt) to @skb.
+ *
+ * The rtmsg header is filled with the fixed AF_MPLS values plus the
+ * route's protocol.  Attributes follow in this order: RTA_NEWDST (only
+ * when the route has output labels), RTA_VIA, RTA_OIF (only when the
+ * route still has a device) and RTA_DST.
+ *
+ * Returns 0, or -EMSGSIZE when the header or any attribute does not
+ * fit; in the latter case the partially built message is cancelled.
+ */
+static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
+			   u32 label, struct mpls_route *rt, int flags)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event,
+					 sizeof(*rtm), flags);
+
+	if (!nlh)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family = AF_MPLS;
+	rtm->rtm_dst_len = 20;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = 0;
+	rtm->rtm_table = RT_TABLE_MAIN;
+	rtm->rtm_protocol = rt->rt_protocol;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_type = RTN_UNICAST;
+	rtm->rtm_flags = 0;
+
+	/* Short-circuit evaluation keeps the attribute order fixed and
+	 * stops at the first attribute that does not fit.
+	 */
+	if ((rt->rt_labels &&
+	     nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) ||
+	    nla_put_via(skb, rt->rt_via_family, rt->rt_via, rt->rt_via_alen) ||
+	    (rt->rt_dev && nla_put_u32(skb, RTA_OIF, rt->rt_dev->ifindex)) ||
+	    nla_put_labels(skb, RTA_DST, 1, &label)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+}
+
+/* Netlink dump callback: emit one RTM_NEWROUTE message per populated
+ * slot of the platform label table, never starting below label 16.
+ *
+ * cb->args[0] holds the label to resume from.  It is updated to the
+ * first label that was not dumped, so a later call continues there.
+ * Returns skb->len.
+ */
+static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int next;
+
+	ASSERT_RTNL();
+
+	next = cb->args[0];
+	if (next < 16)
+		next = 16;
+
+	while (next < net->mpls.platform_labels) {
+		struct mpls_route *rt = net->mpls.platform_label[next];
+
+		/* Stop without advancing when the skb is full. */
+		if (rt && mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
+					  cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+					  next, rt, NLM_F_MULTI) < 0)
+			break;
+
+		next++;
+	}
+	cb->args[0] = next;
+
+	return skb->len;
+}
+
+/* Payload size needed for the route message mpls_dump_route() builds
+ * for @rt: the rtmsg header, RTA_VIA and RTA_DST always, plus
+ * RTA_NEWDST and RTA_OIF when the route has output labels or a device.
+ */
+static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
+{
+	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg));
+
+	payload += nla_total_size(2 + rt->rt_via_alen);	/* RTA_VIA */
+	payload += nla_total_size(4);			/* RTA_DST */
+
+	if (rt->rt_labels)				/* RTA_NEWDST */
+		payload += nla_total_size(rt->rt_labels * 4);
+
+	if (rt->rt_dev)					/* RTA_OIF */
+		payload += nla_total_size(4);
+
+	return payload;
+}
+
+/* Send a route change notification for (@label, @rt) to the
+ * RTNLGRP_MPLS_ROUTE group.
+ *
+ * The sequence number is taken from @nlh when one is supplied and is 0
+ * otherwise.  If the skb cannot be allocated (-ENOBUFS) or filled, the
+ * error is recorded on the group's listeners with rtnl_set_sk_err().
+ */
+static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
+		       struct nlmsghdr *nlh, struct net *net, u32 portid,
+		       unsigned int nlm_flags)
+{
+	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, -ENOBUFS);
+		return;
+	}
+
+	err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in lfib_nlmsg_size */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err);
+		return;
+	}
+
+	rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL);
+}
+
+/* Replace the platform label table of @net with one holding @limit entries.
+ *
+ * All allocations (the new table and the two predefined explicit-null
+ * routes) are done before rtnl_lock() is taken.  Under the lock, routes
+ * at indices >= @limit are removed through mpls_route_update(), the
+ * surviving entries are copied into the new table, the predefined
+ * routes are installed where they newly come into range, and the
+ * namespace's limit and table pointer are switched over.  The old table
+ * is freed after synchronize_rcu().
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails, in which
+ * case the existing table is left untouched.
+ */
+static int resize_platform_label_table(struct net *net, size_t limit)
+{
+ size_t size = sizeof(struct mpls_route *) * limit;
+ size_t old_limit;
+ size_t cp_size;
+ struct mpls_route __rcu **labels = NULL, **old;
+ struct mpls_route *rt0 = NULL, *rt2 = NULL;
+ unsigned index;
+
+ /* A limit of 0 allocates nothing and leaves labels NULL. */
+ if (size) {
+ /* Try kzalloc quietly first, then fall back to vzalloc. */
+ labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (!labels)
+ labels = vzalloc(size);
+
+ if (!labels)
+ goto nolabels;
+ }
+
+ /* In case the predefined labels need to be populated */
+ /* rt0 and rt2 are allocated whenever their label fits in the new
+ * table; whether they are actually installed is decided under the
+ * lock below, and unused ones are freed at the end.
+ */
+ if (limit > LABEL_IPV4_EXPLICIT_NULL) {
+ struct net_device *lo = net->loopback_dev;
+ rt0 = mpls_rt_alloc(lo->addr_len);
+ if (!rt0)
+ goto nort0;
+ rt0->rt_dev = lo;
+ rt0->rt_protocol = RTPROT_KERNEL;
+ rt0->rt_via_family = AF_PACKET;
+ memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
+ }
+ if (limit > LABEL_IPV6_EXPLICIT_NULL) {
+ struct net_device *lo = net->loopback_dev;
+ rt2 = mpls_rt_alloc(lo->addr_len);
+ if (!rt2)
+ goto nort2;
+ rt2->rt_dev = lo;
+ rt2->rt_protocol = RTPROT_KERNEL;
+ rt2->rt_via_family = AF_PACKET;
+ memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
+ }
+
+ rtnl_lock();
+ /* Remember the original table */
+ old = net->mpls.platform_label;
+ old_limit = net->mpls.platform_labels;
+
+ /* Free any labels beyond the new table */
+ for (index = limit; index < old_limit; index++)
+ mpls_route_update(net, index, NULL, NULL, NULL);
+
+ /* Copy over the old labels */
+ /* Copy min(old, new) table bytes: the whole new table when
+ * shrinking, only the old table's worth when growing.
+ */
+ cp_size = size;
+ if (old_limit < limit)
+ cp_size = old_limit * sizeof(struct mpls_route *);
+
+ /* NOTE(review): with limit == 0, labels is NULL and cp_size is 0,
+ * so this is a zero length copy to a NULL destination.
+ */
+ memcpy(labels, old, cp_size);
+
+ /* If needed set the predefined labels */
+ /* Only install a predefined route when its slot did not exist in
+ * the old table; clearing the local pointer hands ownership to the
+ * table so the mpls_rt_free() below skips it.
+ */
+ if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) &&
+ (limit > LABEL_IPV6_EXPLICIT_NULL)) {
+ labels[LABEL_IPV6_EXPLICIT_NULL] = rt2;
+ rt2 = NULL;
+ }
+
+ if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) &&
+ (limit > LABEL_IPV4_EXPLICIT_NULL)) {
+ labels[LABEL_IPV4_EXPLICIT_NULL] = rt0;
+ rt0 = NULL;
+ }
+
+ /* Update the global pointers */
+ net->mpls.platform_labels = limit;
+ net->mpls.platform_label = labels;
+
+ rtnl_unlock();
+
+ /* Release the preallocated routes that were not installed. */
+ mpls_rt_free(rt2);
+ mpls_rt_free(rt0);
+
+ /* Wait for an RCU grace period before freeing the old table. */
+ if (old) {
+ synchronize_rcu();
+ kvfree(old);
+ }
+ return 0;
+
+ /* Error unwinding: each label releases what was allocated before it. */
+nort2:
+ mpls_rt_free(rt0);
+nort0:
+ kvfree(labels);
+nolabels:
+ return -ENOMEM;
+}
+
+/* sysctl handler for net/mpls/platform_labels.
+ *
+ * The value is read or written through a temporary ctl_table that points
+ * at a local copy of the current limit and bounds it by zero and
+ * label_limit.  After a successful write the platform label table is
+ * resized to the new value.
+ *
+ * Returns the proc_dointvec_minmax() result, or for a successful write
+ * the result of resize_platform_label_table().
+ */
+static int mpls_platform_labels(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = table->data;
+	int requested = net->mpls.platform_labels;
+	struct ctl_table shadow = {
+		.procname	= table->procname,
+		.data		= &requested,
+		.maxlen		= sizeof(int),
+		.mode		= table->mode,
+		.extra1		= &zero,
+		.extra2		= &label_limit,
+	};
+	int ret = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
+
+	/* Reads and failed writes leave the table alone. */
+	if (!write || ret != 0)
+		return ret;
+
+	return resize_platform_label_table(net, requested);
+}
+
+/* Template for the per-namespace net/mpls sysctl table.  .data is NULL
+ * here; mpls_net_init() duplicates the table and points .data of the
+ * copy at the owning struct net, which mpls_platform_labels() reads back.
+ */
+static struct ctl_table mpls_table[] = {
+ {
+ .procname = "platform_labels",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = mpls_platform_labels,
+ },
+ { }
+};
+
+/* Per-namespace init: start with an empty platform label table and
+ * register a private copy of the net/mpls sysctl table whose .data
+ * points at @net.
+ *
+ * Returns 0 on success or -ENOMEM when the table copy cannot be
+ * allocated or the sysctl registration fails.
+ */
+static int mpls_net_init(struct net *net)
+{
+	struct ctl_table *table;
+
+	net->mpls.platform_labels = 0;
+	net->mpls.platform_label = NULL;
+
+	table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL);
+	if (table == NULL)
+		return -ENOMEM;
+
+	table[0].data = net;
+	net->mpls.ctl = register_net_sysctl(net, "net/mpls", table);
+	if (net->mpls.ctl == NULL) {
+		/* Nothing else refers to the copy, so free it here
+		 * instead of leaking it.
+		 */
+		kfree(table);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Per-namespace teardown: unregister the sysctl table and free the copy
+ * made in mpls_net_init(), then free every route and finally the
+ * platform label table itself.
+ */
+static void mpls_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+ unsigned int index;
+
+ /* Fetch the table pointer before unregistering so the copy can be
+ * freed afterwards.
+ */
+ table = net->mpls.ctl->ctl_table_arg;
+ unregister_net_sysctl_table(net->mpls.ctl);
+ kfree(table);
+
+ /* An rcu grace period has elapsed since there was a device in
+ * the network namespace (and thus the last in flight packet)
+ * left this network namespace. This is because
+ * unregister_netdevice_many and netdev_run_todo has completed
+ * for each network device that was in this network namespace.
+ *
+ * As such no additional rcu synchronization is necessary when
+ * freeing the platform_label table.
+ */
+ rtnl_lock();
+ for (index = 0; index < net->mpls.platform_labels; index++) {
+ struct mpls_route *rt = net->mpls.platform_label[index];
+ rcu_assign_pointer(net->mpls.platform_label[index], NULL);
+ mpls_rt_free(rt);
+ }
+ rtnl_unlock();
+
+ kvfree(net->mpls.platform_label);
+}
+
+/* Per network namespace init/exit hooks; registered in mpls_init(). */
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_net_init,
+ .exit = mpls_net_exit,
+};
+
+/* Module init: register the per-namespace operations and the netdevice
+ * notifier, then add the packet type and the PF_MPLS rtnetlink route
+ * handlers.
+ *
+ * Returns 0 on success, or the error from whichever registration
+ * failed; a notifier failure first undoes the pernet registration.
+ */
+static int __init mpls_init(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4);
+
+	err = register_pernet_subsys(&mpls_net_ops);
+	if (err)
+		return err;
+
+	err = register_netdevice_notifier(&mpls_dev_notifier);
+	if (err) {
+		unregister_pernet_subsys(&mpls_net_ops);
+		return err;
+	}
+
+	dev_add_pack(&mpls_packet_type);
+
+	rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
+	rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
+	rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
+
+	return 0;
+}
+module_init(mpls_init);
+
+/* Module exit: undo the registrations made by mpls_init() in reverse
+ * order (rtnetlink handlers, packet type, netdevice notifier, pernet ops).
+ */
+static void __exit mpls_exit(void)
+{
+ rtnl_unregister_all(PF_MPLS);
+ dev_remove_pack(&mpls_packet_type);
+ unregister_netdevice_notifier(&mpls_dev_notifier);
+ unregister_pernet_subsys(&mpls_net_ops);
+}
+module_exit(mpls_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_NETPROTO(PF_MPLS);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
new file mode 100644
index 000000000000..fb6de92052c4
--- /dev/null
+++ b/net/mpls/internal.h
@@ -0,0 +1,59 @@
+#ifndef MPLS_INTERNAL_H
+#define MPLS_INTERNAL_H
+
+/* Label values with a predefined meaning; the RFC that defines each one
+ * is noted beside it.
+ */
+#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */
+#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */
+#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */
+#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */
+#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */
+#define LABEL_GAL 13 /* RFC5586 */
+#define LABEL_OAM_ALERT 14 /* RFC3429 */
+#define LABEL_EXTENSION 15 /* RFC7274 */
+
+
+/* One label stack entry in its encoded form: a single big-endian 32 bit
+ * word.  Built by mpls_entry_encode() and unpacked by mpls_entry_decode().
+ */
+struct mpls_shim_hdr {
+ __be32 label_stack_entry;
+};
+
+/* Host-order fields of a label stack entry, as produced by
+ * mpls_entry_decode().
+ */
+struct mpls_entry_decoded {
+ u32 label; /* label value */
+ u8 ttl; /* ttl field */
+ u8 tc; /* tc field */
+ u8 bos; /* bottom of stack flag */
+};
+
+struct sk_buff;
+
+/* View the network header of @skb as a label stack entry. */
+static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
+{
+	void *network_hdr = skb_network_header(skb);
+
+	return network_hdr;
+}
+
+/* Pack @label, @ttl, @tc and the bottom of stack flag @bos into one
+ * label stack entry, shifting each field into place with the
+ * MPLS_LS_*_SHIFT constants and converting the word to big-endian.
+ */
+static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
+{
+	struct mpls_shim_hdr result;
+	u32 word = label << MPLS_LS_LABEL_SHIFT;
+
+	word |= tc << MPLS_LS_TC_SHIFT;
+	if (bos)
+		word |= 1 << MPLS_LS_S_SHIFT;
+	word |= ttl << MPLS_LS_TTL_SHIFT;
+
+	result.label_stack_entry = cpu_to_be32(word);
+	return result;
+}
+
+/* Unpack the label stack entry at @hdr into host-order fields, using
+ * the MPLS_LS_*_MASK and MPLS_LS_*_SHIFT constants for each field.
+ */
+static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr)
+{
+	unsigned word = be32_to_cpu(hdr->label_stack_entry);
+	struct mpls_entry_decoded fields = {
+		.label	= (word & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT,
+		.ttl	= (word & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT,
+		.tc	= (word & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT,
+		.bos	= (word & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT,
+	};
+
+	return fields;
+}
+
+/* Netlink attribute helpers for label stacks (4 bytes per label):
+ * nla_put_labels() appends an attribute built from label[],
+ * nla_get_labels() validates one and decodes it into label[].
+ */
+int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]);
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]);
+
+#endif /* MPLS_INTERNAL_H */