summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/00-INDEX2
-rw-r--r--Documentation/networking/openvswitch.txt195
-rw-r--r--MAINTAINERS8
-rw-r--r--drivers/infiniband/core/addr.c47
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_cm.c15
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c220
-rw-r--r--drivers/infiniband/hw/nes/nes_cm.c14
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c28
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c4
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c4
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c2
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c22
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/l2t.c27
-rw-r--r--drivers/net/ethernet/chelsio/cxgb3/l2t.h2
-rw-r--r--drivers/net/ethernet/cisco/enic/enic_main.c2
-rw-r--r--drivers/net/ethernet/sfc/rx.c2
-rw-r--r--drivers/net/ethernet/sfc/selftest.c4
-rw-r--r--drivers/net/ethernet/sfc/tx.c2
-rw-r--r--drivers/net/ethernet/xilinx/ll_temac_main.c2
-rw-r--r--drivers/s390/net/qeth_l3_main.c4
-rw-r--r--drivers/scsi/cxgbi/cxgb3i/cxgb3i.c2
-rw-r--r--drivers/scsi/cxgbi/cxgb4i/cxgb4i.c8
-rw-r--r--drivers/scsi/cxgbi/libcxgbi.c10
-rw-r--r--include/linux/genetlink.h24
-rw-r--r--include/linux/if_vlan.h34
-rw-r--r--include/linux/openvswitch.h452
-rw-r--r--include/linux/skbuff.h11
-rw-r--r--include/net/dst.h6
-rw-r--r--include/net/genetlink.h2
-rw-r--r--include/net/ipv6.h2
-rw-r--r--include/net/ndisc.h9
-rw-r--r--net/8021q/vlan_core.c33
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/clip.c2
-rw-r--r--net/bridge/br_multicast.c3
-rw-r--r--net/bridge/br_netfilter.c2
-rw-r--r--net/bridge/netfilter/ebt_ip6.c3
-rw-r--r--net/bridge/netfilter/ebt_log.c3
-rw-r--r--net/core/dst.c2
-rw-r--r--net/core/neighbour.c2
-rw-r--r--net/core/skbuff.c11
-rw-r--r--net/decnet/dn_neigh.c2
-rw-r--r--net/decnet/dn_route.c8
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/tcp.c7
-rw-r--r--net/ipv4/tcp_input.c2
-rw-r--r--net/ipv4/tcp_output.c10
-rw-r--r--net/ipv6/addrconf.c2
-rw-r--r--net/ipv6/exthdrs_core.c11
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/ip6_fib.c117
-rw-r--r--net/ipv6/ip6_input.c3
-rw-r--r--net/ipv6/ip6_output.c9
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c3
-rw-r--r--net/ipv6/route.c118
-rw-r--r--net/ipv6/sit.c4
-rw-r--r--net/netfilter/ipset/ip_set_getport.c4
-rw-r--r--net/netfilter/xt_AUDIT.c3
-rw-r--r--net/netfilter/xt_TCPMSS.c3
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c3
-rw-r--r--net/netfilter/xt_hashlimit.c3
-rw-r--r--net/netfilter/xt_socket.c4
-rw-r--r--net/netlink/genetlink.c21
-rw-r--r--net/openvswitch/Kconfig28
-rw-r--r--net/openvswitch/Makefile14
-rw-r--r--net/openvswitch/actions.c415
-rw-r--r--net/openvswitch/datapath.c1912
-rw-r--r--net/openvswitch/datapath.h125
-rw-r--r--net/openvswitch/dp_notify.c66
-rw-r--r--net/openvswitch/flow.c1346
-rw-r--r--net/openvswitch/flow.h199
-rw-r--r--net/openvswitch/vport-internal_dev.c241
-rw-r--r--net/openvswitch/vport-internal_dev.h28
-rw-r--r--net/openvswitch/vport-netdev.c198
-rw-r--r--net/openvswitch/vport-netdev.h42
-rw-r--r--net/openvswitch/vport.c396
-rw-r--r--net/openvswitch/vport.h205
-rw-r--r--net/sched/sch_teql.c2
-rw-r--r--net/xfrm/xfrm_policy.c2
-rw-r--r--security/lsm_audit.c3
-rw-r--r--security/selinux/hooks.c3
85 files changed, 6363 insertions, 445 deletions
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index bbce1215434a..9ad9ddeb384c 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -144,6 +144,8 @@ nfc.txt
- The Linux Near Field Communication (NFS) subsystem.
olympic.txt
- IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info.
+openvswitch.txt
+ - Open vSwitch developer documentation.
operstates.txt
- Overview of network interface operational states.
packet_mmap.txt
diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt
new file mode 100644
index 000000000000..b8a048b8df3a
--- /dev/null
+++ b/Documentation/networking/openvswitch.txt
@@ -0,0 +1,195 @@
+Open vSwitch datapath developer documentation
+=============================================
+
+The Open vSwitch kernel module allows flexible userspace control over
+flow-level packet processing on selected network devices. It can be
+used to implement a plain Ethernet switch, network device bonding,
+VLAN processing, network access control, flow-based network control,
+and so on.
+
+The kernel module implements multiple "datapaths" (analogous to
+bridges), each of which can have multiple "vports" (analogous to ports
+within a bridge). Each datapath also has associated with it a "flow
+table" that userspace populates with "flows" that map from keys based
+on packet headers and metadata to sets of actions. The most common
+action forwards the packet to another vport; other actions are also
+implemented.
+
+When a packet arrives on a vport, the kernel module processes it by
+extracting its flow key and looking it up in the flow table. If there
+is a matching flow, it executes the associated actions. If there is
+no match, it queues the packet to userspace for processing (as part of
+its processing, userspace will likely set up a flow to handle further
+packets of the same type entirely in-kernel).
+
+
+Flow key compatibility
+----------------------
+
+Network protocols evolve over time. New protocols become important
+and existing protocols lose their prominence. For the Open vSwitch
+kernel module to remain relevant, it must be possible for newer
+versions to parse additional protocols as part of the flow key. It
+might even be desirable, someday, to drop support for parsing
+protocols that have become obsolete. Therefore, the Netlink interface
+to Open vSwitch is designed to allow carefully written userspace
+applications to work with any version of the flow key, past or future.
+
+To support this forward and backward compatibility, whenever the
+kernel module passes a packet to userspace, it also passes along the
+flow key that it parsed from the packet. Userspace then extracts its
+own notion of a flow key from the packet and compares it against the
+kernel-provided version:
+
+ - If userspace's notion of the flow key for the packet matches the
+ kernel's, then nothing special is necessary.
+
+ - If the kernel's flow key includes more fields than the userspace
+ version of the flow key, for example if the kernel decoded IPv6
+ headers but userspace stopped at the Ethernet type (because it
+ does not understand IPv6), then again nothing special is
+ necessary. Userspace can still set up a flow in the usual way,
+ as long as it uses the kernel-provided flow key to do it.
+
+ - If the userspace flow key includes more fields than the
+ kernel's, for example if userspace decoded an IPv6 header but
+ the kernel stopped at the Ethernet type, then userspace can
+ forward the packet manually, without setting up a flow in the
+ kernel. This case is bad for performance because every packet
+ that the kernel considers part of the flow must go to userspace,
+ but the forwarding behavior is correct. (If userspace can
+ determine that the values of the extra fields would not affect
+ forwarding behavior, then it could set up a flow anyway.)
+
+How flow keys evolve over time is important to making this work, so
+the following sections go into detail.
+
+
+Flow key format
+---------------
+
+A flow key is passed over a Netlink socket as a sequence of Netlink
+attributes. Some attributes represent packet metadata, defined as any
+information about a packet that cannot be extracted from the packet
+itself, e.g. the vport on which the packet was received. Most
+attributes, however, are extracted from headers within the packet,
+e.g. source and destination addresses from Ethernet, IP, or TCP
+headers.
+
+The <linux/openvswitch.h> header file defines the exact format of the
+flow key attributes. For informal explanatory purposes here, we write
+them as comma-separated strings, with parentheses indicating arguments
+and nesting. For example, the following could represent a flow key
+corresponding to a TCP packet that arrived on vport 1:
+
+ in_port(1), eth(src=e0:91:f5:21:d0:b2, dst=00:02:e3:0f:80:a4),
+ eth_type(0x0800), ipv4(src=172.16.0.20, dst=172.18.0.52, proto=17, tos=0,
+ frag=no), tcp(src=49163, dst=80)
+
+Often we ellipsize arguments not important to the discussion, e.g.:
+
+ in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...)
+
+
+Basic rule for evolving flow keys
+---------------------------------
+
+Some care is needed to really maintain forward and backward
+compatibility for applications that follow the rules listed under
+"Flow key compatibility" above.
+
+The basic rule is obvious:
+
+ ------------------------------------------------------------------
+ New network protocol support must only supplement existing flow
+ key attributes. It must not change the meaning of already defined
+ flow key attributes.
+ ------------------------------------------------------------------
+
+This rule does have less-obvious consequences so it is worth working
+through a few examples. Suppose, for example, that the kernel module
+did not already implement VLAN parsing. Instead, it just interpreted
+the 802.1Q TPID (0x8100) as the Ethertype then stopped parsing the
+packet. The flow key for any packet with an 802.1Q header would look
+essentially like this, ignoring metadata:
+
+ eth(...), eth_type(0x8100)
+
+Naively, to add VLAN support, it makes sense to add a new "vlan" flow
+key attribute to contain the VLAN tag, then continue to decode the
+encapsulated headers beyond the VLAN tag using the existing field
+definitions. With this change, an TCP packet in VLAN 10 would have a
+flow key much like this:
+
+ eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...)
+
+But this change would negatively affect a userspace application that
+has not been updated to understand the new "vlan" flow key attribute.
+The application could, following the flow compatibility rules above,
+ignore the "vlan" attribute that it does not understand and therefore
+assume that the flow contained IP packets. This is a bad assumption
+(the flow only contains IP packets if one parses and skips over the
+802.1Q header) and it could cause the application's behavior to change
+across kernel versions even though it follows the compatibility rules.
+
+The solution is to use a set of nested attributes. This is, for
+example, why 802.1Q support uses nested attributes. A TCP packet in
+VLAN 10 is actually expressed as:
+
+ eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800),
+ ip(proto=6, ...), tcp(...)))
+
+Notice how the "eth_type", "ip", and "tcp" flow key attributes are
+nested inside the "encap" attribute. Thus, an application that does
+not understand the "vlan" key will not see either of those attributes
+and therefore will not misinterpret them. (Also, the outer eth_type
+is still 0x8100, not changed to 0x0800.)
+
+Handling malformed packets
+--------------------------
+
+Don't drop packets in the kernel for malformed protocol headers, bad
+checksums, etc. This would prevent userspace from implementing a
+simple Ethernet switch that forwards every packet.
+
+Instead, in such a case, include an attribute with "empty" content.
+It doesn't matter if the empty content could be valid protocol values,
+as long as those values are rarely seen in practice, because userspace
+can always forward all packets with those values to userspace and
+handle them individually.
+
+For example, consider a packet that contains an IP header that
+indicates protocol 6 for TCP, but which is truncated just after the IP
+header, so that the TCP header is missing. The flow key for this
+packet would include a tcp attribute with all-zero src and dst, like
+this:
+
+ eth(...), eth_type(0x0800), ip(proto=6, ...), tcp(src=0, dst=0)
+
+As another example, consider a packet with an Ethernet type of 0x8100,
+indicating that a VLAN TCI should follow, but which is truncated just
+after the Ethernet type. The flow key for this packet would include
+an all-zero-bits vlan and an empty encap attribute, like this:
+
+ eth(...), eth_type(0x8100), vlan(0), encap()
+
+Unlike a TCP packet with source and destination ports 0, an
+all-zero-bits VLAN TCI is not that rare, so the CFI bit (aka
+VLAN_TAG_PRESENT inside the kernel) is ordinarily set in a vlan
+attribute expressly to allow this situation to be distinguished.
+Thus, the flow key in this second example unambiguously indicates a
+missing or malformed VLAN TCI.
+
+Other rules
+-----------
+
+The other rules for flow keys are much less subtle:
+
+ - Duplicate attributes are not allowed at a given nesting level.
+
+ - Ordering of attributes is not significant.
+
+ - When the kernel sends a given flow key to userspace, it always
+ composes it the same way. This allows userspace to hash and
+ compare entire flow keys that it may not be able to fully
+ interpret.
diff --git a/MAINTAINERS b/MAINTAINERS
index c88eb7bb3a69..209ad0695ba2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4868,6 +4868,14 @@ S: Maintained
T: git git://openrisc.net/~jonas/linux
F: arch/openrisc
+OPENVSWITCH
+M: Jesse Gross <jesse@nicira.com>
+L: dev@openvswitch.org
+W: http://openvswitch.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/jesse/openvswitch.git
+S: Maintained
+F: net/openvswitch/
+
OPL4 DRIVER
M: Clemens Ladisch <clemens@ladisch.de>
L: alsa-devel@alsa-project.org (moderated for non-subscribers)
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index a20c3c8224ea..1612cfd50f39 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -178,6 +178,25 @@ static void queue_req(struct addr_req *req)
mutex_unlock(&lock);
}
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *addr)
+{
+ struct neighbour *n;
+ int ret;
+
+ rcu_read_lock();
+ n = dst_get_neighbour_noref(dst);
+ if (!n || !(n->nud_state & NUD_VALID)) {
+ if (n)
+ neigh_event_send(n, NULL);
+ ret = -ENODATA;
+ } else {
+ ret = rdma_copy_addr(addr, dst->dev, n->ha);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
static int addr4_resolve(struct sockaddr_in *src_in,
struct sockaddr_in *dst_in,
struct rdma_dev_addr *addr)
@@ -185,7 +204,6 @@ static int addr4_resolve(struct sockaddr_in *src_in,
__be32 src_ip = src_in->sin_addr.s_addr;
__be32 dst_ip = dst_in->sin_addr.s_addr;
struct rtable *rt;
- struct neighbour *neigh;
struct flowi4 fl4;
int ret;
@@ -214,20 +232,7 @@ static int addr4_resolve(struct sockaddr_in *src_in,
goto put;
}
- neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->dst.dev);
- if (!neigh || !(neigh->nud_state & NUD_VALID)) {
- rcu_read_lock();
- neigh_event_send(dst_get_neighbour(&rt->dst), NULL);
- rcu_read_unlock();
- ret = -ENODATA;
- if (neigh)
- goto release;
- goto put;
- }
-
- ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
-release:
- neigh_release(neigh);
+ ret = dst_fetch_ha(&rt->dst, addr);
put:
ip_rt_put(rt);
out:
@@ -240,7 +245,6 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
struct rdma_dev_addr *addr)
{
struct flowi6 fl6;
- struct neighbour *neigh;
struct dst_entry *dst;
int ret;
@@ -276,16 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
goto put;
}
- rcu_read_lock();
- neigh = dst_get_neighbour(dst);
- if (!neigh || !(neigh->nud_state & NUD_VALID)) {
- if (neigh)
- neigh_event_send(neigh, NULL);
- ret = -ENODATA;
- } else {
- ret = rdma_copy_addr(addr, dst->dev, neigh->ha);
- }
- rcu_read_unlock();
+ ret = dst_fetch_ha(dst, addr);
put:
dst_release(dst);
return ret;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index c88b12beef25..740dcc065cf2 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -1338,7 +1338,6 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
struct iwch_ep *child_ep, *parent_ep = ctx;
struct cpl_pass_accept_req *req = cplhdr(skb);
unsigned int hwtid = GET_TID(req);
- struct neighbour *neigh;
struct dst_entry *dst;
struct l2t_entry *l2t;
struct rtable *rt;
@@ -1375,10 +1374,7 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
goto reject;
}
dst = &rt->dst;
- rcu_read_lock();
- neigh = dst_get_neighbour(dst);
- l2t = t3_l2t_get(tdev, neigh, neigh->dev);
- rcu_read_unlock();
+ l2t = t3_l2t_get(tdev, dst, NULL);
if (!l2t) {
printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
__func__);
@@ -1889,7 +1885,6 @@ static int is_loopback_dst(struct iw_cm_id *cm_id)
int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
struct iwch_dev *h = to_iwch_dev(cm_id->device);
- struct neighbour *neigh;
struct iwch_ep *ep;
struct rtable *rt;
int err = 0;
@@ -1947,13 +1942,7 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
goto fail3;
}
ep->dst = &rt->dst;
-
- rcu_read_lock();
- neigh = dst_get_neighbour(ep->dst);
-
- /* get a l2t entry */
- ep->l2t = t3_l2t_get(ep->com.tdev, neigh, neigh->dev);
- rcu_read_unlock();
+ ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL);
if (!ep->l2t) {
printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
err = -ENOMEM;
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0747004313ad..0668bb3472d0 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -1556,6 +1556,67 @@ static void get_4tuple(struct cpl_pass_accept_req *req,
return;
}
+static int import_ep(struct c4iw_ep *ep, __be32 peer_ip, struct dst_entry *dst,
+ struct c4iw_dev *cdev, bool clear_mpa_v1)
+{
+ struct neighbour *n;
+ int err, step;
+
+ rcu_read_lock();
+ n = dst_get_neighbour_noref(dst);
+ err = -ENODEV;
+ if (!n)
+ goto out;
+ err = -ENOMEM;
+ if (n->dev->flags & IFF_LOOPBACK) {
+ struct net_device *pdev;
+
+ pdev = ip_dev_find(&init_net, peer_ip);
+ ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+ n, pdev, 0);
+ if (!ep->l2t)
+ goto out;
+ ep->mtu = pdev->mtu;
+ ep->tx_chan = cxgb4_port_chan(pdev);
+ ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+ step = cdev->rdev.lldi.ntxq /
+ cdev->rdev.lldi.nchan;
+ ep->txq_idx = cxgb4_port_idx(pdev) * step;
+ step = cdev->rdev.lldi.nrxq /
+ cdev->rdev.lldi.nchan;
+ ep->ctrlq_idx = cxgb4_port_idx(pdev);
+ ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+ cxgb4_port_idx(pdev) * step];
+ dev_put(pdev);
+ } else {
+ ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+ n, n->dev, 0);
+ if (!ep->l2t)
+ goto out;
+ ep->mtu = dst_mtu(ep->dst);
+ ep->tx_chan = cxgb4_port_chan(n->dev);
+ ep->smac_idx = (cxgb4_port_viid(n->dev) & 0x7F) << 1;
+ step = cdev->rdev.lldi.ntxq /
+ cdev->rdev.lldi.nchan;
+ ep->txq_idx = cxgb4_port_idx(n->dev) * step;
+ ep->ctrlq_idx = cxgb4_port_idx(n->dev);
+ step = cdev->rdev.lldi.nrxq /
+ cdev->rdev.lldi.nchan;
+ ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+ cxgb4_port_idx(n->dev) * step];
+
+ if (clear_mpa_v1) {
+ ep->retry_with_mpa_v1 = 0;
+ ep->tried_with_mpa_v1 = 0;
+ }
+ }
+ err = 0;
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct c4iw_ep *child_ep, *parent_ep;
@@ -1563,18 +1624,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid));
struct tid_info *t = dev->rdev.lldi.tids;
unsigned int hwtid = GET_TID(req);
- struct neighbour *neigh;
struct dst_entry *dst;
- struct l2t_entry *l2t;
struct rtable *rt;
__be32 local_ip, peer_ip;
__be16 local_port, peer_port;
- struct net_device *pdev;
- u32 tx_chan, smac_idx;
- u16 rss_qid;
- u32 mtu;
- int step;
- int txq_idx, ctrlq_idx;
+ int err;
parent_ep = lookup_stid(t, stid);
PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
@@ -1596,49 +1650,24 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
goto reject;
}
dst = &rt->dst;
- rcu_read_lock();
- neigh = dst_get_neighbour(dst);
- if (neigh->dev->flags & IFF_LOOPBACK) {
- pdev = ip_dev_find(&init_net, peer_ip);
- BUG_ON(!pdev);
- l2t = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, pdev, 0);
- mtu = pdev->mtu;
- tx_chan = cxgb4_port_chan(pdev);
- smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
- step = dev->rdev.lldi.ntxq / dev->rdev.lldi.nchan;
- txq_idx = cxgb4_port_idx(pdev) * step;
- ctrlq_idx = cxgb4_port_idx(pdev);
- step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
- rss_qid = dev->rdev.lldi.rxq_ids[cxgb4_port_idx(pdev) * step];
- dev_put(pdev);
- } else {
- l2t = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, neigh->dev, 0);
- mtu = dst_mtu(dst);
- tx_chan = cxgb4_port_chan(neigh->dev);
- smac_idx = (cxgb4_port_viid(neigh->dev) & 0x7F) << 1;
- step = dev->rdev.lldi.ntxq / dev->rdev.lldi.nchan;
- txq_idx = cxgb4_port_idx(neigh->dev) * step;
- ctrlq_idx = cxgb4_port_idx(neigh->dev);
- step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
- rss_qid = dev->rdev.lldi.rxq_ids[
- cxgb4_port_idx(neigh->dev) * step];
- }
- rcu_read_unlock();
- if (!l2t) {
- printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+
+ child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+ if (!child_ep) {
+ printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
__func__);
dst_release(dst);
goto reject;
}
- child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
- if (!child_ep) {
- printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+ err = import_ep(child_ep, peer_ip, dst, dev, false);
+ if (err) {
+ printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
__func__);
- cxgb4_l2t_release(l2t);
dst_release(dst);
+ kfree(child_ep);
goto reject;
}
+
state_set(&child_ep->com, CONNECTING);
child_ep->com.dev = dev;
child_ep->com.cm_id = NULL;
@@ -1651,18 +1680,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
c4iw_get_ep(&parent_ep->com);
child_ep->parent_ep = parent_ep;
child_ep->tos = GET_POPEN_TOS(ntohl(req->tos_stid));
- child_ep->l2t = l2t;
child_ep->dst = dst;
child_ep->hwtid = hwtid;
- child_ep->tx_chan = tx_chan;
- child_ep->smac_idx = smac_idx;
- child_ep->rss_qid = rss_qid;
- child_ep->mtu = mtu;
- child_ep->txq_idx = txq_idx;
- child_ep->ctrlq_idx = ctrlq_idx;
PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__,
- tx_chan, smac_idx, rss_qid);
+ child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid);
init_timer(&child_ep->timer);
cxgb4_insert_tid(t, child_ep, hwtid);
@@ -1792,11 +1814,8 @@ static int is_neg_adv_abort(unsigned int status)
static int c4iw_reconnect(struct c4iw_ep *ep)
{
- int err = 0;
struct rtable *rt;
- struct net_device *pdev;
- struct neighbour *neigh;
- int step;
+ int err = 0;
PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
init_timer(&ep->timer);
@@ -1824,47 +1843,10 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
}
ep->dst = &rt->dst;
- rcu_read_lock();
- neigh = dst_get_neighbour(ep->dst);
-
- /* get a l2t entry */
- if (neigh->dev->flags & IFF_LOOPBACK) {
- PDBG("%s LOOPBACK\n", __func__);
- pdev = ip_dev_find(&init_net,
- ep->com.cm_id->remote_addr.sin_addr.s_addr);
- ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
- neigh, pdev, 0);
- ep->mtu = pdev->mtu;
- ep->tx_chan = cxgb4_port_chan(pdev);
- ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
- step = ep->com.dev->rdev.lldi.ntxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->txq_idx = cxgb4_port_idx(pdev) * step;
- step = ep->com.dev->rdev.lldi.nrxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->ctrlq_idx = cxgb4_port_idx(pdev);
- ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
- cxgb4_port_idx(pdev) * step];
- dev_put(pdev);
- } else {
- ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
- neigh, neigh->dev, 0);
- ep->mtu = dst_mtu(ep->dst);
- ep->tx_chan = cxgb4_port_chan(neigh->dev);
- ep->smac_idx = (cxgb4_port_viid(neigh->dev) & 0x7F) << 1;
- step = ep->com.dev->rdev.lldi.ntxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->txq_idx = cxgb4_port_idx(neigh->dev) * step;
- ep->ctrlq_idx = cxgb4_port_idx(neigh->dev);
- step = ep->com.dev->rdev.lldi.nrxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
- cxgb4_port_idx(neigh->dev) * step];
- }
- rcu_read_unlock();
- if (!ep->l2t) {
+ err = import_ep(ep, ep->com.cm_id->remote_addr.sin_addr.s_addr,
+ ep->dst, ep->com.dev, false);
+ if (err) {
printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
- err = -ENOMEM;
goto fail4;
}
@@ -2240,13 +2222,10 @@ err:
int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
{
- int err = 0;
struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
struct c4iw_ep *ep;
struct rtable *rt;
- struct net_device *pdev;
- struct neighbour *neigh;
- int step;
+ int err = 0;
if ((conn_param->ord > c4iw_max_read_depth) ||
(conn_param->ird > c4iw_max_read_depth)) {
@@ -2307,49 +2286,10 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
}
ep->dst = &rt->dst;
- rcu_read_lock();
- neigh = dst_get_neighbour(ep->dst);
-
- /* get a l2t entry */
- if (neigh->dev->flags & IFF_LOOPBACK) {
- PDBG("%s LOOPBACK\n", __func__);
- pdev = ip_dev_find(&init_net,
- cm_id->remote_addr.sin_addr.s_addr);
- ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
- neigh, pdev, 0);
- ep->mtu = pdev->mtu;
- ep->tx_chan = cxgb4_port_chan(pdev);
- ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
- step = ep->com.dev->rdev.lldi.ntxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->txq_idx = cxgb4_port_idx(pdev) * step;
- step = ep->com.dev->rdev.lldi.nrxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->ctrlq_idx = cxgb4_port_idx(pdev);
- ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
- cxgb4_port_idx(pdev) * step];
- dev_put(pdev);
- } else {
- ep->l2t = cxgb4_l2t_get(ep->com.dev->rdev.lldi.l2t,
- neigh, neigh->dev, 0);
- ep->mtu = dst_mtu(ep->dst);
- ep->tx_chan = cxgb4_port_chan(neigh->dev);
- ep->smac_idx = (cxgb4_port_viid(neigh->dev) & 0x7F) << 1;
- step = ep->com.dev->rdev.lldi.ntxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->txq_idx = cxgb4_port_idx(neigh->dev) * step;
- ep->ctrlq_idx = cxgb4_port_idx(neigh->dev);
- step = ep->com.dev->rdev.lldi.nrxq /
- ep->com.dev->rdev.lldi.nchan;
- ep->rss_qid = ep->com.dev->rdev.lldi.rxq_ids[
- cxgb4_port_idx(neigh->dev) * step];
- ep->retry_with_mpa_v1 = 0;
- ep->tried_with_mpa_v1 = 0;
- }
- rcu_read_unlock();
- if (!ep->l2t) {
+ err = import_ep(ep, cm_id->remote_addr.sin_addr.s_addr,
+ ep->dst, ep->com.dev, true);
+ if (err) {
printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
- err = -ENOMEM;
goto fail4;
}
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 0a52d72371ee..b1e6cae5f47e 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -1348,7 +1348,8 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi
else
netdev = nesvnic->netdev;
- neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev);
+ rcu_read_lock();
+ neigh = dst_get_neighbour_noref(&rt->dst);
if (neigh) {
if (neigh->nud_state & NUD_VALID) {
nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X"
@@ -1359,7 +1360,6 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi
if (!memcmp(nesadapter->arp_table[arpindex].mac_addr,
neigh->ha, ETH_ALEN)) {
/* Mac address same as in nes_arp_table */
- neigh_release(neigh);
ip_rt_put(rt);
return rc;
}
@@ -1373,15 +1373,11 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi
dst_ip, NES_ARP_ADD);
rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL,
NES_ARP_RESOLVE);
+ } else {
+ neigh_event_send(neigh, NULL);
}
- neigh_release(neigh);
- }
-
- if ((neigh == NULL) || (!(neigh->nud_state & NUD_VALID))) {
- rcu_read_lock();
- neigh_event_send(dst_get_neighbour(&rt->dst), NULL);
- rcu_read_unlock();
}
+ rcu_read_unlock();
ip_rt_put(rt);
return rc;
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index d3ed89ca4852..3514ca05deea 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -556,15 +556,13 @@ static int path_rec_start(struct net_device *dev,
}
/* called with rcu_read_lock */
-static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
+static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path;
struct ipoib_neigh *neigh;
- struct neighbour *n;
unsigned long flags;
- n = dst_get_neighbour(skb_dst(skb));
neigh = ipoib_neigh_alloc(n, skb->dev);
if (!neigh) {
++dev->stats.tx_dropped;
@@ -638,16 +636,13 @@ err_drop:
}
/* called with rcu_read_lock */
-static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
+static void ipoib_path_lookup(struct sk_buff *skb, struct neighbour *n, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n;
/* Look up path record for unicasts */
- n = dst_get_neighbour(dst);
if (n->ha[4] != 0xff) {
- neigh_add_path(skb, dev);
+ neigh_add_path(skb, n, dev);
return;
}
@@ -723,12 +718,17 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
unsigned long flags;
rcu_read_lock();
- if (likely(skb_dst(skb)))
- n = dst_get_neighbour(skb_dst(skb));
-
+ if (likely(skb_dst(skb))) {
+ n = dst_get_neighbour_noref(skb_dst(skb));
+ if (!n) {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ goto unlock;
+ }
+ }
if (likely(n)) {
if (unlikely(!*to_ipoib_neigh(n))) {
- ipoib_path_lookup(skb, dev);
+ ipoib_path_lookup(skb, n, dev);
goto unlock;
}
@@ -751,7 +751,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
list_del(&neigh->list);
ipoib_neigh_free(dev, neigh);
spin_unlock_irqrestore(&priv->lock, flags);
- ipoib_path_lookup(skb, dev);
+ ipoib_path_lookup(skb, n, dev);
goto unlock;
}
@@ -841,7 +841,7 @@ static int ipoib_hard_header(struct sk_buff *skb,
dst = skb_dst(skb);
n = NULL;
if (dst)
- n = dst_get_neighbour_raw(dst);
+ n = dst_get_neighbour_noref_raw(dst);
if ((!dst || !n) && daddr) {
struct ipoib_pseudoheader *phdr =
(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 873bff97e69e..f7ff9dd66cda 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -269,7 +269,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
skb->dev = dev;
if (dst)
- n = dst_get_neighbour_raw(dst);
+ n = dst_get_neighbour_noref_raw(dst);
if (!dst || !n) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof (struct ipoib_pseudoheader));
@@ -728,7 +728,7 @@ out:
rcu_read_lock();
if (dst)
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
if (n && !*to_ipoib_neigh(n)) {
struct ipoib_neigh *neigh = ipoib_neigh_alloc(n,
skb->dev);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 79695bb034d6..477bc9713a66 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3300,14 +3300,14 @@ int __devinit bnx2x_alloc_mem_bp(struct bnx2x *bp)
msix_table_size = bp->igu_sb_cnt + 1;
/* fp array: RSS plus CNIC related L2 queues */
- fp = kzalloc((BNX2X_MAX_RSS_COUNT(bp) + NON_ETH_CONTEXT_USE) *
+ fp = kcalloc(BNX2X_MAX_RSS_COUNT(bp) + NON_ETH_CONTEXT_USE,
sizeof(*fp), GFP_KERNEL);
if (!fp)
goto alloc_err;
bp->fp = fp;
/* msix table */
- tbl = kzalloc(msix_table_size * sizeof(*tbl), GFP_KERNEL);
+ tbl = kcalloc(msix_table_size, sizeof(*tbl), GFP_KERNEL);
if (!tbl)
goto alloc_err;
bp->msix_table = tbl;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index 14517691f8db..a34362e9fd9c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -3342,7 +3342,7 @@ static inline int bnx2x_mcast_refresh_registry_e1(struct bnx2x *bp,
if (!list_empty(&o->registry.exact_match.macs))
return 0;
- elem = kzalloc(sizeof(*elem)*len, GFP_ATOMIC);
+ elem = kcalloc(len, sizeof(*elem), GFP_ATOMIC);
if (!elem) {
BNX2X_ERR("Failed to allocate registry memory\n");
return -ENOMEM;
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c
index 7f7882d24bc6..65e4b280619a 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c
@@ -969,7 +969,7 @@ static int nb_callback(struct notifier_block *self, unsigned long event,
case (NETEVENT_REDIRECT):{
struct netevent_redirect *nr = ctx;
cxgb_redirect(nr->old, nr->new);
- cxgb_neigh_update(dst_get_neighbour(nr->new));
+ cxgb_neigh_update(dst_get_neighbour_noref(nr->new));
break;
}
default:
@@ -1072,8 +1072,11 @@ static int is_offloading(struct net_device *dev)
static void cxgb_neigh_update(struct neighbour *neigh)
{
- struct net_device *dev = neigh->dev;
+ struct net_device *dev;
+ if (!neigh)
+ return;
+ dev = neigh->dev;
if (dev && (is_offloading(dev))) {
struct t3cdev *tdev = dev2t3cdev(dev);
@@ -1107,6 +1110,7 @@ static void set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e)
static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new)
{
struct net_device *olddev, *newdev;
+ struct neighbour *n;
struct tid_info *ti;
struct t3cdev *tdev;
u32 tid;
@@ -1114,8 +1118,16 @@ static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new)
struct l2t_entry *e;
struct t3c_tid_entry *te;
- olddev = dst_get_neighbour(old)->dev;
- newdev = dst_get_neighbour(new)->dev;
+ n = dst_get_neighbour_noref(old);
+ if (!n)
+ return;
+ olddev = n->dev;
+
+ n = dst_get_neighbour_noref(new);
+ if (!n)
+ return;
+ newdev = n->dev;
+
if (!is_offloading(olddev))
return;
if (!is_offloading(newdev)) {
@@ -1132,7 +1144,7 @@ static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new)
}
/* Add new L2T entry */
- e = t3_l2t_get(tdev, dst_get_neighbour(new), newdev);
+ e = t3_l2t_get(tdev, new, newdev);
if (!e) {
printk(KERN_ERR "%s: couldn't allocate new l2t entry!\n",
__func__);
diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
index 70fec8b1140f..3fa3c8833ed7 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
@@ -298,18 +298,31 @@ static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
spin_unlock(&e->lock);
}
-struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
+struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
struct net_device *dev)
{
struct l2t_entry *e = NULL;
+ struct neighbour *neigh;
+ struct port_info *p;
struct l2t_data *d;
int hash;
- u32 addr = *(u32 *) neigh->primary_key;
- int ifidx = neigh->dev->ifindex;
- struct port_info *p = netdev_priv(dev);
- int smt_idx = p->port_id;
+ u32 addr;
+ int ifidx;
+ int smt_idx;
rcu_read_lock();
+ neigh = dst_get_neighbour_noref(dst);
+ if (!neigh)
+ goto done_rcu;
+
+ addr = *(u32 *) neigh->primary_key;
+ ifidx = neigh->dev->ifindex;
+
+ if (!dev)
+ dev = neigh->dev;
+ p = netdev_priv(dev);
+ smt_idx = p->port_id;
+
d = L2DATA(cdev);
if (!d)
goto done_rcu;
@@ -323,7 +336,7 @@ struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
l2t_hold(d, e);
if (atomic_read(&e->refcnt) == 1)
reuse_entry(e, neigh);
- goto done;
+ goto done_unlock;
}
/* Need to allocate a new entry */
@@ -344,7 +357,7 @@ struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
e->vlan = VLAN_NONE;
spin_unlock(&e->lock);
}
-done:
+done_unlock:
write_unlock_bh(&d->lock);
done_rcu:
rcu_read_unlock();
diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.h b/drivers/net/ethernet/chelsio/cxgb3/l2t.h
index c5f54796e2cb..c4e864369751 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/l2t.h
+++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.h
@@ -109,7 +109,7 @@ static inline void set_arp_failure_handler(struct sk_buff *skb,
void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e);
void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh);
-struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
+struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
struct net_device *dev);
int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
struct l2t_entry *e);
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 1fe5df0284a6..2fd9db4b1be5 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2379,7 +2379,7 @@ static int __devinit enic_probe(struct pci_dev *pdev,
#endif
/* Allocate structure for port profiles */
- enic->pp = kzalloc(num_pps * sizeof(*enic->pp), GFP_KERNEL);
+ enic->pp = kcalloc(num_pps, sizeof(*enic->pp), GFP_KERNEL);
if (!enic->pp) {
pr_err("port profile alloc failed, aborting\n");
err = -ENOMEM;
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 752d521c09b1..955b14956deb 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -669,7 +669,7 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
rx_queue->ptr_mask);
/* Allocate RX buffers */
- rx_queue->buffer = kzalloc(entries * sizeof(*rx_queue->buffer),
+ rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer),
GFP_KERNEL);
if (!rx_queue->buffer)
return -ENOMEM;
diff --git a/drivers/net/ethernet/sfc/selftest.c b/drivers/net/ethernet/sfc/selftest.c
index 822f6c2a6a7c..52edd24fcde3 100644
--- a/drivers/net/ethernet/sfc/selftest.c
+++ b/drivers/net/ethernet/sfc/selftest.c
@@ -503,8 +503,8 @@ efx_test_loopback(struct efx_tx_queue *tx_queue,
/* Determine how many packets to send */
state->packet_count = efx->txq_entries / 3;
state->packet_count = min(1 << (i << 2), state->packet_count);
- state->skbs = kzalloc(sizeof(state->skbs[0]) *
- state->packet_count, GFP_KERNEL);
+ state->skbs = kcalloc(state->packet_count,
+ sizeof(state->skbs[0]), GFP_KERNEL);
if (!state->skbs)
return -ENOMEM;
state->flush = false;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index e0e00b3d6a82..72f0fbc73b1a 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -479,7 +479,7 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask);
/* Allocate software ring */
- tx_queue->buffer = kzalloc(entries * sizeof(*tx_queue->buffer),
+ tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer),
GFP_KERNEL);
if (!tx_queue->buffer)
return -ENOMEM;
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 282330d9801b..903a77b416df 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -237,7 +237,7 @@ static int temac_dma_bd_init(struct net_device *ndev)
struct sk_buff *skb;
int i;
- lp->rx_skb = kzalloc(sizeof(*lp->rx_skb) * RX_BD_NUM, GFP_KERNEL);
+ lp->rx_skb = kcalloc(RX_BD_NUM, sizeof(*lp->rx_skb), GFP_KERNEL);
if (!lp->rx_skb) {
dev_err(&ndev->dev,
"can't allocate memory for DMA RX buffer\n");
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 63578925bc59..b2a55e3fde0b 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2759,7 +2759,7 @@ int inline qeth_l3_get_cast_type(struct qeth_card *card, struct sk_buff *skb)
rcu_read_lock();
dst = skb_dst(skb);
if (dst)
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
if (n) {
cast_type = n->type;
rcu_read_unlock();
@@ -2855,7 +2855,7 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
rcu_read_lock();
dst = skb_dst(skb);
if (dst)
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
if (ipv == 4) {
/* IPv4 */
hdr->hdr.l3.flags = qeth_l3_get_qeth_hdr_flags4(cast_type);
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index 000294a9df80..36739da8bc15 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -966,7 +966,7 @@ static int init_act_open(struct cxgbi_sock *csk)
csk->saddr.sin_addr.s_addr = chba->ipv4addr;
csk->rss_qid = 0;
- csk->l2t = t3_l2t_get(t3dev, dst_get_neighbour(dst), ndev);
+ csk->l2t = t3_l2t_get(t3dev, dst, ndev);
if (!csk->l2t) {
pr_err("NO l2t available.\n");
return -EINVAL;
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index ac7a9b1e3e23..5a4a3bfc60cf 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -1127,6 +1127,7 @@ static int init_act_open(struct cxgbi_sock *csk)
struct net_device *ndev = cdev->ports[csk->port_id];
struct port_info *pi = netdev_priv(ndev);
struct sk_buff *skb = NULL;
+ struct neighbour *n;
unsigned int step;
log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
@@ -1141,7 +1142,12 @@ static int init_act_open(struct cxgbi_sock *csk)
cxgbi_sock_set_flag(csk, CTPF_HAS_ATID);
cxgbi_sock_get(csk);
- csk->l2t = cxgb4_l2t_get(lldi->l2t, dst_get_neighbour(csk->dst), ndev, 0);
+ n = dst_get_neighbour_noref(csk->dst);
+ if (!n) {
+ pr_err("%s, can't get neighbour of csk->dst.\n", ndev->name);
+ goto rel_resource;
+ }
+ csk->l2t = cxgb4_l2t_get(lldi->l2t, n, ndev, 0);
if (!csk->l2t) {
pr_err("%s, cannot alloc l2t.\n", ndev->name);
goto rel_resource;
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index c10f74a566f2..1d25a87aa47b 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -472,6 +472,7 @@ static struct cxgbi_sock *cxgbi_check_route(struct sockaddr *dst_addr)
struct net_device *ndev;
struct cxgbi_device *cdev;
struct rtable *rt = NULL;
+ struct neighbour *n;
struct flowi4 fl4;
struct cxgbi_sock *csk = NULL;
unsigned int mtu = 0;
@@ -493,7 +494,12 @@ static struct cxgbi_sock *cxgbi_check_route(struct sockaddr *dst_addr)
goto err_out;
}
dst = &rt->dst;
- ndev = dst_get_neighbour(dst)->dev;
+ n = dst_get_neighbour_noref(dst);
+ if (!n) {
+ err = -ENODEV;
+ goto rel_rt;
+ }
+ ndev = n->dev;
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
pr_info("multi-cast route %pI4, port %u, dev %s.\n",
@@ -507,7 +513,7 @@ static struct cxgbi_sock *cxgbi_check_route(struct sockaddr *dst_addr)
ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr);
mtu = ndev->mtu;
pr_info("rt dev %s, loopback -> %s, mtu %u.\n",
- dst_get_neighbour(dst)->dev->name, ndev->name, mtu);
+ n->dev->name, ndev->name, mtu);
}
cdev = cxgbi_device_find_by_netdev(ndev, &port);
diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index 61549b26ad6f..73c28dea10ae 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -85,6 +85,30 @@ enum {
/* All generic netlink requests are serialized by a global lock. */
extern void genl_lock(void);
extern void genl_unlock(void);
+#ifdef CONFIG_PROVE_LOCKING
+extern int lockdep_genl_is_held(void);
+#endif
+
+/**
+ * rcu_dereference_genl - rcu_dereference with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
+ * or genl mutex. Note : Please prefer genl_dereference() or rcu_dereference()
+ */
+#define rcu_dereference_genl(p) \
+ rcu_dereference_check(p, lockdep_genl_is_held())
+
+/**
+ * genl_dereference - fetch RCU pointer when updates are prevented by genl mutex
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Return the value of the specified RCU-protected pointer, but omit
+ * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
+ * caller holds genl mutex.
+ */
+#define genl_dereference(p) \
+ rcu_dereference_protected(p, lockdep_genl_is_held())
#endif /* __KERNEL__ */
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 12d5543b14f2..070ac50c1d2d 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -310,6 +310,40 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
return protocol;
}
+
+static inline void vlan_set_encap_proto(struct sk_buff *skb,
+ struct vlan_hdr *vhdr)
+{
+ __be16 proto;
+ unsigned char *rawp;
+
+ /*
+ * Was a VLAN packet, grab the encapsulated protocol, which the layer
+ * three protocols care about.
+ */
+
+ proto = vhdr->h_vlan_encapsulated_proto;
+ if (ntohs(proto) >= 1536) {
+ skb->protocol = proto;
+ return;
+ }
+
+ rawp = skb->data;
+ if (*(unsigned short *) rawp == 0xFFFF)
+ /*
+ * This is a magic hack to spot IPX packets. Older Novell
+ * breaks the protocol design and runs IPX over 802.3 without
+ * an 802.2 LLC layer. We look for FFFF which isn't a used
+ * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
+ * but does for the rest.
+ */
+ skb->protocol = htons(ETH_P_802_3);
+ else
+ /*
+ * Real 802.2 LLC
+ */
+ skb->protocol = htons(ETH_P_802_2);
+}
#endif /* __KERNEL__ */
/* VLAN IOCTLs are found in sockios.h */
diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
new file mode 100644
index 000000000000..eb1efa54fe84
--- /dev/null
+++ b/include/linux/openvswitch.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef _LINUX_OPENVSWITCH_H
+#define _LINUX_OPENVSWITCH_H 1
+
+#include <linux/types.h>
+
+/**
+ * struct ovs_header - header for OVS Generic Netlink messages.
+ * @dp_ifindex: ifindex of local port for datapath (0 to make a request not
+ * specific to a datapath).
+ *
+ * Attributes following the header are specific to a particular OVS Generic
+ * Netlink family, but all of the OVS families use this header.
+ */
+
+struct ovs_header {
+ int dp_ifindex;
+};
+
+/* Datapaths. */
+
+#define OVS_DATAPATH_FAMILY "ovs_datapath"
+#define OVS_DATAPATH_MCGROUP "ovs_datapath"
+#define OVS_DATAPATH_VERSION 0x1
+
+enum ovs_datapath_cmd {
+ OVS_DP_CMD_UNSPEC,
+ OVS_DP_CMD_NEW,
+ OVS_DP_CMD_DEL,
+ OVS_DP_CMD_GET,
+ OVS_DP_CMD_SET
+};
+
+/**
+ * enum ovs_datapath_attr - attributes for %OVS_DP_* commands.
+ * @OVS_DP_ATTR_NAME: Name of the network device that serves as the "local
+ * port". This is the name of the network device whose dp_ifindex is given in
+ * the &struct ovs_header. Always present in notifications. Required in
+ * %OVS_DP_NEW requests. May be used as an alternative to specifying
+ * dp_ifindex in other requests (with a dp_ifindex of 0).
+ * @OVS_DP_ATTR_UPCALL_PID: The Netlink socket in userspace that is initially
+ * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on
+ * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
+ * not be sent.
+ * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
+ * datapath. Always present in notifications.
+ *
+ * These attributes follow the &struct ovs_header within the Generic Netlink
+ * payload for %OVS_DP_* commands.
+ */
+enum ovs_datapath_attr {
+ OVS_DP_ATTR_UNSPEC,
+ OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */
+ OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */
+ OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */
+ __OVS_DP_ATTR_MAX
+};
+
+#define OVS_DP_ATTR_MAX (__OVS_DP_ATTR_MAX - 1)
+
+struct ovs_dp_stats {
+ __u64 n_hit; /* Number of flow table matches. */
+ __u64 n_missed; /* Number of flow table misses. */
+ __u64 n_lost; /* Number of misses not sent to userspace. */
+ __u64 n_flows; /* Number of flows present */
+};
+
+struct ovs_vport_stats {
+ __u64 rx_packets; /* total packets received */
+ __u64 tx_packets; /* total packets transmitted */
+ __u64 rx_bytes; /* total bytes received */
+ __u64 tx_bytes; /* total bytes transmitted */
+ __u64 rx_errors; /* bad packets received */
+ __u64 tx_errors; /* packet transmit problems */
+ __u64 rx_dropped; /* no space in linux buffers */
+ __u64 tx_dropped; /* no space available in linux */
+};
+
+/* Fixed logical ports. */
+#define OVSP_LOCAL ((__u16)0)
+
+/* Packet transfer. */
+
+#define OVS_PACKET_FAMILY "ovs_packet"
+#define OVS_PACKET_VERSION 0x1
+
+enum ovs_packet_cmd {
+ OVS_PACKET_CMD_UNSPEC,
+
+ /* Kernel-to-user notifications. */
+ OVS_PACKET_CMD_MISS, /* Flow table miss. */
+ OVS_PACKET_CMD_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */
+
+ /* Userspace commands. */
+ OVS_PACKET_CMD_EXECUTE /* Apply actions to a packet. */
+};
+
+/**
+ * enum ovs_packet_attr - attributes for %OVS_PACKET_* commands.
+ * @OVS_PACKET_ATTR_PACKET: Present for all notifications. Contains the entire
+ * packet as received, from the start of the Ethernet header onward. For
+ * %OVS_PACKET_CMD_ACTION, %OVS_PACKET_ATTR_PACKET reflects changes made by
+ * actions preceding %OVS_ACTION_ATTR_USERSPACE, but %OVS_PACKET_ATTR_KEY is
+ * the flow key extracted from the packet as originally received.
+ * @OVS_PACKET_ATTR_KEY: Present for all notifications. Contains the flow key
+ * extracted from the packet as nested %OVS_KEY_ATTR_* attributes. This allows
+ * userspace to adapt its flow setup strategy by comparing its notion of the
+ * flow key against the kernel's.
+ * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used
+ * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes.
+ * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION
+ * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an
+ * %OVS_USERSPACE_ATTR_USERDATA attribute.
+ *
+ * These attributes follow the &struct ovs_header within the Generic Netlink
+ * payload for %OVS_PACKET_* commands.
+ */
+enum ovs_packet_attr {
+ OVS_PACKET_ATTR_UNSPEC,
+ OVS_PACKET_ATTR_PACKET, /* Packet data. */
+ OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */
+ OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
+ OVS_PACKET_ATTR_USERDATA, /* u64 OVS_ACTION_ATTR_USERSPACE arg. */
+ __OVS_PACKET_ATTR_MAX
+};
+
+#define OVS_PACKET_ATTR_MAX (__OVS_PACKET_ATTR_MAX - 1)
+
+/* Virtual ports. */
+
+#define OVS_VPORT_FAMILY "ovs_vport"
+#define OVS_VPORT_MCGROUP "ovs_vport"
+#define OVS_VPORT_VERSION 0x1
+
+enum ovs_vport_cmd {
+ OVS_VPORT_CMD_UNSPEC,
+ OVS_VPORT_CMD_NEW,
+ OVS_VPORT_CMD_DEL,
+ OVS_VPORT_CMD_GET,
+ OVS_VPORT_CMD_SET
+};
+
+enum ovs_vport_type {
+ OVS_VPORT_TYPE_UNSPEC,
+ OVS_VPORT_TYPE_NETDEV, /* network device */
+ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
+ __OVS_VPORT_TYPE_MAX
+};
+
+#define OVS_VPORT_TYPE_MAX (__OVS_VPORT_TYPE_MAX - 1)
+
+/**
+ * enum ovs_vport_attr - attributes for %OVS_VPORT_* commands.
+ * @OVS_VPORT_ATTR_PORT_NO: 32-bit port number within datapath.
+ * @OVS_VPORT_ATTR_TYPE: 32-bit %OVS_VPORT_TYPE_* constant describing the type
+ * of vport.
+ * @OVS_VPORT_ATTR_NAME: Name of vport. For a vport based on a network device
+ * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes
+ * plus a null terminator.
+ * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information.
+ * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that
+ * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on
+ * this port. A value of zero indicates that upcalls should not be sent.
+ * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for
+ * packets sent or received through the vport.
+ *
+ * These attributes follow the &struct ovs_header within the Generic Netlink
+ * payload for %OVS_VPORT_* commands.
+ *
+ * For %OVS_VPORT_CMD_NEW requests, the %OVS_VPORT_ATTR_TYPE and
+ * %OVS_VPORT_ATTR_NAME attributes are required. %OVS_VPORT_ATTR_PORT_NO is
+ * optional; if not specified a free port number is automatically selected.
+ * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type
+ * of vport.
+ * and other attributes are ignored.
+ *
+ * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to
+ * look up the vport to operate on; otherwise dp_idx from the &struct
+ * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport.
+ */
+enum ovs_vport_attr {
+ OVS_VPORT_ATTR_UNSPEC,
+ OVS_VPORT_ATTR_PORT_NO, /* u32 port number within datapath */
+ OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. */
+ OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */
+ OVS_VPORT_ATTR_OPTIONS, /* nested attributes, varies by vport type */
+ OVS_VPORT_ATTR_UPCALL_PID, /* u32 Netlink PID to receive upcalls */
+ OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */
+ __OVS_VPORT_ATTR_MAX
+};
+
+#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
+
+/* Flows. */
+
+#define OVS_FLOW_FAMILY "ovs_flow"
+#define OVS_FLOW_MCGROUP "ovs_flow"
+#define OVS_FLOW_VERSION 0x1
+
+enum ovs_flow_cmd {
+ OVS_FLOW_CMD_UNSPEC,
+ OVS_FLOW_CMD_NEW,
+ OVS_FLOW_CMD_DEL,
+ OVS_FLOW_CMD_GET,
+ OVS_FLOW_CMD_SET
+};
+
+struct ovs_flow_stats {
+ __u64 n_packets; /* Number of matched packets. */
+ __u64 n_bytes; /* Number of matched bytes. */
+};
+
+enum ovs_key_attr {
+ OVS_KEY_ATTR_UNSPEC,
+ OVS_KEY_ATTR_ENCAP, /* Nested set of encapsulated attributes. */
+ OVS_KEY_ATTR_PRIORITY, /* u32 skb->priority */
+ OVS_KEY_ATTR_IN_PORT, /* u32 OVS dp port number */
+ OVS_KEY_ATTR_ETHERNET, /* struct ovs_key_ethernet */
+ OVS_KEY_ATTR_VLAN, /* be16 VLAN TCI */
+ OVS_KEY_ATTR_ETHERTYPE, /* be16 Ethernet type */
+ OVS_KEY_ATTR_IPV4, /* struct ovs_key_ipv4 */
+ OVS_KEY_ATTR_IPV6, /* struct ovs_key_ipv6 */
+ OVS_KEY_ATTR_TCP, /* struct ovs_key_tcp */
+ OVS_KEY_ATTR_UDP, /* struct ovs_key_udp */
+ OVS_KEY_ATTR_ICMP, /* struct ovs_key_icmp */
+ OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */
+ OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */
+ OVS_KEY_ATTR_ND, /* struct ovs_key_nd */
+ __OVS_KEY_ATTR_MAX
+};
+
+#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)
+
+/**
+ * enum ovs_frag_type - IPv4 and IPv6 fragment type
+ * @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
+ * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0.
+ * @OVS_FRAG_TYPE_LATER: Packet is a fragment with nonzero offset.
+ *
+ * Used as the @ipv4_frag in &struct ovs_key_ipv4 and as @ipv6_frag &struct
+ * ovs_key_ipv6.
+ */
+enum ovs_frag_type {
+ OVS_FRAG_TYPE_NONE,
+ OVS_FRAG_TYPE_FIRST,
+ OVS_FRAG_TYPE_LATER,
+ __OVS_FRAG_TYPE_MAX
+};
+
+#define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1)
+
+struct ovs_key_ethernet {
+ __u8 eth_src[6];
+ __u8 eth_dst[6];
+};
+
+struct ovs_key_ipv4 {
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ __u8 ipv4_proto;
+ __u8 ipv4_tos;
+ __u8 ipv4_ttl;
+ __u8 ipv4_frag; /* One of OVS_FRAG_TYPE_*. */
+};
+
+struct ovs_key_ipv6 {
+ __be32 ipv6_src[4];
+ __be32 ipv6_dst[4];
+ __be32 ipv6_label; /* 20-bits in least-significant bits. */
+ __u8 ipv6_proto;
+ __u8 ipv6_tclass;
+ __u8 ipv6_hlimit;
+ __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */
+};
+
+struct ovs_key_tcp {
+ __be16 tcp_src;
+ __be16 tcp_dst;
+};
+
+struct ovs_key_udp {
+ __be16 udp_src;
+ __be16 udp_dst;
+};
+
+struct ovs_key_icmp {
+ __u8 icmp_type;
+ __u8 icmp_code;
+};
+
+struct ovs_key_icmpv6 {
+ __u8 icmpv6_type;
+ __u8 icmpv6_code;
+};
+
+struct ovs_key_arp {
+ __be32 arp_sip;
+ __be32 arp_tip;
+ __be16 arp_op;
+ __u8 arp_sha[6];
+ __u8 arp_tha[6];
+};
+
+struct ovs_key_nd {
+ __u32 nd_target[4];
+ __u8 nd_sll[6];
+ __u8 nd_tll[6];
+};
+
+/**
+ * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
+ * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
+ * key. Always present in notifications. Required for all requests (except
+ * dumps).
+ * @OVS_FLOW_ATTR_ACTIONS: Nested %OVS_ACTION_ATTR_* attributes specifying
+ * the actions to take for packets that match the key. Always present in
+ * notifications. Required for %OVS_FLOW_CMD_NEW requests, optional for
+ * %OVS_FLOW_CMD_SET requests.
+ * @OVS_FLOW_ATTR_STATS: &struct ovs_flow_stats giving statistics for this
+ * flow. Present in notifications if the stats would be nonzero. Ignored in
+ * requests.
+ * @OVS_FLOW_ATTR_TCP_FLAGS: An 8-bit value giving the OR'd value of all of the
+ * TCP flags seen on packets in this flow. Only present in notifications for
+ * TCP flows, and only if it would be nonzero. Ignored in requests.
+ * @OVS_FLOW_ATTR_USED: A 64-bit integer giving the time, in milliseconds on
+ * the system monotonic clock, at which a packet was last processed for this
+ * flow. Only present in notifications if a packet has been processed for this
+ * flow. Ignored in requests.
+ * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the
+ * last-used time, accumulated TCP flags, and statistics for this flow.
+ * Otherwise ignored in requests. Never present in notifications.
+ *
+ * These attributes follow the &struct ovs_header within the Generic Netlink
+ * payload for %OVS_FLOW_* commands.
+ */
+enum ovs_flow_attr {
+ OVS_FLOW_ATTR_UNSPEC,
+ OVS_FLOW_ATTR_KEY, /* Sequence of OVS_KEY_ATTR_* attributes. */
+ OVS_FLOW_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
+ OVS_FLOW_ATTR_STATS, /* struct ovs_flow_stats. */
+ OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */
+ OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */
+ OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */
+ __OVS_FLOW_ATTR_MAX
+};
+
+#define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1)
+
+/**
+ * enum ovs_sample_attr - Attributes for %OVS_ACTION_ATTR_SAMPLE action.
+ * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with
+ * @OVS_ACTION_ATTR_SAMPLE. A value of 0 samples no packets, a value of
+ * %UINT32_MAX samples all packets and intermediate values sample intermediate
+ * fractions of packets.
+ * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event.
+ * Actions are passed as nested attributes.
+ *
+ * Executes the specified actions with the given probability on a per-packet
+ * basis.
+ */
+enum ovs_sample_attr {
+ OVS_SAMPLE_ATTR_UNSPEC,
+ OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
+ OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
+ __OVS_SAMPLE_ATTR_MAX,
+};
+
+#define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)
+
+/**
+ * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
+ * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
+ * message should be sent. Required.
+ * @OVS_USERSPACE_ATTR_USERDATA: If present, its u64 argument is copied to the
+ * %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA,
+ */
+enum ovs_userspace_attr {
+ OVS_USERSPACE_ATTR_UNSPEC,
+ OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */
+ OVS_USERSPACE_ATTR_USERDATA, /* u64 optional user-specified cookie. */
+ __OVS_USERSPACE_ATTR_MAX
+};
+
+#define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1)
+
+/**
+ * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument.
+ * @vlan_tpid: Tag protocol identifier (TPID) to push.
+ * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set
+ * (but it will not be set in the 802.1Q header that is pushed).
+ *
+ * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID
+ * values are those that the kernel module also parses as 802.1Q headers, to
+ * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN
+ * from having surprising results.
+ */
+struct ovs_action_push_vlan {
+ __be16 vlan_tpid; /* 802.1Q TPID. */
+ __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */
+};
+
+/**
+ * enum ovs_action_attr - Action types.
+ *
+ * @OVS_ACTION_ATTR_OUTPUT: Output packet to port.
+ * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested
+ * %OVS_USERSPACE_ATTR_* attributes.
+ * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The
+ * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and its
+ * value.
+ * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the
+ * packet.
+ * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet.
+ * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in
+ * the nested %OVS_SAMPLE_ATTR_* attributes.
+ *
+ * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
+ * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
+ * type may not be changed.
+ */
+
+enum ovs_action_attr {
+ OVS_ACTION_ATTR_UNSPEC,
+ OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */
+ OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */
+ OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */
+ OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */
+ OVS_ACTION_ATTR_POP_VLAN, /* No argument. */
+ OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */
+ __OVS_ACTION_ATTR_MAX
+};
+
+#define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
+
+#endif /* _LINUX_OPENVSWITCH_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cec0657d0d32..12e6fed73f8e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -568,8 +568,9 @@ extern struct sk_buff *skb_clone(struct sk_buff *skb,
gfp_t priority);
extern struct sk_buff *skb_copy(const struct sk_buff *skb,
gfp_t priority);
-extern struct sk_buff *pskb_copy(struct sk_buff *skb,
- gfp_t gfp_mask);
+extern struct sk_buff *__pskb_copy(struct sk_buff *skb,
+ int headroom, gfp_t gfp_mask);
+
extern int pskb_expand_head(struct sk_buff *skb,
int nhead, int ntail,
gfp_t gfp_mask);
@@ -1799,6 +1800,12 @@ static inline dma_addr_t skb_frag_dma_map(struct device *dev,
frag->page_offset + offset, size, dir);
}
+static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
+ gfp_t gfp_mask)
+{
+ return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
+}
+
/**
* skb_clone_writable - is the header of a clone writable
* @skb: buffer to check
diff --git a/include/net/dst.h b/include/net/dst.h
index 6faec1a60216..01343b043517 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -86,12 +86,12 @@ struct dst_entry {
};
};
-static inline struct neighbour *dst_get_neighbour(struct dst_entry *dst)
+static inline struct neighbour *dst_get_neighbour_noref(struct dst_entry *dst)
{
return rcu_dereference(dst->_neighbour);
}
-static inline struct neighbour *dst_get_neighbour_raw(struct dst_entry *dst)
+static inline struct neighbour *dst_get_neighbour_noref_raw(struct dst_entry *dst)
{
return rcu_dereference_raw(dst->_neighbour);
}
@@ -392,7 +392,7 @@ static inline void dst_confirm(struct dst_entry *dst)
struct neighbour *n;
rcu_read_lock();
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
neigh_confirm(n);
rcu_read_unlock();
}
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 82d8d09faa44..7db32995ccd3 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -128,6 +128,8 @@ extern int genl_register_mc_group(struct genl_family *family,
struct genl_multicast_group *grp);
extern void genl_unregister_mc_group(struct genl_family *family,
struct genl_multicast_group *grp);
+extern void genl_notify(struct sk_buff *skb, struct net *net, u32 pid,
+ u32 group, struct nlmsghdr *nlh, gfp_t flags);
/**
* genlmsg_put - Add generic netlink header to netlink message
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f35188e002d9..e4170a22fc6f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -558,7 +558,7 @@ extern void ipv6_push_frag_opts(struct sk_buff *skb,
u8 *proto);
extern int ipv6_skip_exthdr(const struct sk_buff *, int start,
- u8 *nexthdrp);
+ u8 *nexthdrp, __be16 *frag_offp);
extern int ipv6_ext_hdr(u8 nexthdr);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 62beeb97c4b1..c977c377c015 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -145,13 +145,4 @@ int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl,
extern void inet6_ifinfo_notify(int event,
struct inet6_dev *idev);
-static inline struct neighbour * ndisc_get_neigh(struct net_device *dev, const struct in6_addr *addr)
-{
-
- if (dev)
- return __neigh_lookup_errno(&nd_tbl, addr, dev);
-
- return ERR_PTR(-ENODEV);
-}
-
#endif
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index f5ffc02729d6..9c95e8e054f9 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -110,39 +110,6 @@ static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
return skb;
}
-static void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr)
-{
- __be16 proto;
- unsigned char *rawp;
-
- /*
- * Was a VLAN packet, grab the encapsulated protocol, which the layer
- * three protocols care about.
- */
-
- proto = vhdr->h_vlan_encapsulated_proto;
- if (ntohs(proto) >= 1536) {
- skb->protocol = proto;
- return;
- }
-
- rawp = skb->data;
- if (*(unsigned short *) rawp == 0xFFFF)
- /*
- * This is a magic hack to spot IPX packets. Older Novell
- * breaks the protocol design and runs IPX over 802.3 without
- * an 802.2 LLC layer. We look for FFFF which isn't a used
- * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
- * but does for the rest.
- */
- skb->protocol = htons(ETH_P_802_3);
- else
- /*
- * Real 802.2 LLC
- */
- skb->protocol = htons(ETH_P_802_2);
-}
-
struct sk_buff *vlan_untag(struct sk_buff *skb)
{
struct vlan_hdr *vhdr;
diff --git a/net/Kconfig b/net/Kconfig
index 2d998735c4d8..e07272d0bb2d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -215,6 +215,7 @@ source "net/sched/Kconfig"
source "net/dcb/Kconfig"
source "net/dns_resolver/Kconfig"
source "net/batman-adv/Kconfig"
+source "net/openvswitch/Kconfig"
config RPS
boolean
diff --git a/net/Makefile b/net/Makefile
index acdde4950de4..ad432fa4d934 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -69,3 +69,4 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
obj-$(CONFIG_NFC) += nfc/
+obj-$(CONFIG_OPENVSWITCH) += openvswitch/
diff --git a/net/atm/clip.c b/net/atm/clip.c
index c84ce7fe3f9b..c12c2582457c 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -338,7 +338,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
if (!n) {
pr_err("NO NEIGHBOUR !\n");
dev_kfree_skb(skb);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 7743e0d109ea..375417e633c9 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1458,6 +1458,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
const struct ipv6hdr *ip6h;
u8 icmp6_type;
u8 nexthdr;
+ __be16 frag_off;
unsigned len;
int offset;
int err;
@@ -1483,7 +1484,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
return -EINVAL;
nexthdr = ip6h->nexthdr;
- offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr);
+ offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
return 0;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index d6ec3720c77e..834dfabb30f9 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
if (!skb->dev)
goto free_skb;
dst = skb_dst(skb);
- neigh = dst_get_neighbour(dst);
+ neigh = dst_get_neighbour_noref(dst);
if (neigh->hh.hh_len) {
neigh_hh_bridge(&neigh->hh, skb);
skb->dev = nf_bridge->physindev;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 2ed0056a39a8..99c85668f551 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -55,9 +55,10 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (info->bitmask & EBT_IP6_PROTO) {
uint8_t nexthdr = ih6->nexthdr;
+ __be16 frag_off;
int offset_ph;
- offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr);
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off);
if (offset_ph == -1)
return false;
if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 6e5a8bb9b940..88d7d1d1cb1b 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -113,6 +113,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
const struct ipv6hdr *ih;
struct ipv6hdr _iph;
uint8_t nexthdr;
+ __be16 frag_off;
int offset_ph;
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
@@ -123,7 +124,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
&ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
nexthdr = ih->nexthdr;
- offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr);
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off);
if (offset_ph == -1)
goto out;
print_ports(skb, nexthdr, offset_ph);
diff --git a/net/core/dst.c b/net/core/dst.c
index d5e2c4c09107..43d94cedbf7c 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -366,7 +366,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
dev_hold(dst->dev);
dev_put(dev);
rcu_read_lock();
- neigh = dst_get_neighbour(dst);
+ neigh = dst_get_neighbour_noref(dst);
if (neigh && neigh->dev == dev) {
neigh->dev = dst->dev;
dev_hold(dst->dev);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cdf8dc34f0ba..4af151e1bf5d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1190,7 +1190,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
rcu_read_lock();
/* On shaper/eql skb->dst->neighbour != neigh :( */
- if (dst && (n2 = dst_get_neighbour(dst)) != NULL)
+ if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL)
n1 = n2;
n1->output(n1, skb);
rcu_read_unlock();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 678ae4e783aa..fd3646209b65 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -840,8 +840,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
*
* Make a copy of both an &sk_buff and part of its data, located
@@ -852,16 +853,16 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
- unsigned int size = skb_end_pointer(skb) - skb->head;
+ unsigned int size = skb_headlen(skb) + headroom;
struct sk_buff *n = alloc_skb(size, gfp_mask);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb_headroom(skb));
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
@@ -897,7 +898,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(pskb_copy);
+EXPORT_SYMBOL(__pskb_copy);
/**
* pskb_expand_head - reallocate header of &sk_buff
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 3532ac64c82d..7d2fff29380f 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -202,7 +202,7 @@ static int dn_neigh_output_packet(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct dn_route *rt = (struct dn_route *)dst;
- struct neighbour *neigh = dst_get_neighbour(dst);
+ struct neighbour *neigh = dst_get_neighbour_noref(dst);
struct net_device *dev = neigh->dev;
char mac_addr[ETH_ALEN];
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 94f4ec036669..f31ce72dca65 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -244,7 +244,7 @@ static int dn_dst_gc(struct dst_ops *ops)
*/
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
{
- struct neighbour *n = dst_get_neighbour(dst);
+ struct neighbour *n = dst_get_neighbour_noref(dst);
u32 min_mtu = 230;
struct dn_dev *dn;
@@ -713,7 +713,7 @@ out:
static int dn_to_neigh_output(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n = dst_get_neighbour(dst);
+ struct neighbour *n = dst_get_neighbour_noref(dst);
return n->output(n, skb);
}
@@ -728,7 +728,7 @@ static int dn_output(struct sk_buff *skb)
int err = -EINVAL;
- if ((neigh = dst_get_neighbour(dst)) == NULL)
+ if ((neigh = dst_get_neighbour_noref(dst)) == NULL)
goto error;
skb->dev = dev;
@@ -852,7 +852,7 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
}
rt->rt_type = res->type;
- if (dev != NULL && dst_get_neighbour(&rt->dst) == NULL) {
+ if (dev != NULL && dst_get_neighbour_noref(&rt->dst) == NULL) {
n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
if (IS_ERR(n))
return PTR_ERR(n);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2b32296b7958..fe070c1593ab 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -731,7 +731,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
+ struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
const struct in6_addr *addr6;
int addr_type;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0d5e5672f3d1..ff302bde8890 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -206,7 +206,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
}
rcu_read_lock();
- neigh = dst_get_neighbour(dst);
+ neigh = dst_get_neighbour_noref(dst);
if (neigh) {
int res = neigh_output(neigh, skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7047069cf967..90402a2a26a9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -419,7 +419,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
int len, HHUptod;
rcu_read_lock();
- n = dst_get_neighbour(&r->dst);
+ n = dst_get_neighbour_noref(&r->dst);
HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
rcu_read_unlock();
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 45156be3abfd..a09fe253b917 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1009,7 +1009,12 @@ new_segment:
int merge = 0;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = TCP_PAGE(sk);
- int off = TCP_OFF(sk);
+ int off;
+
+ if (page && page_count(page) == 1)
+ TCP_OFF(sk) = 0;
+
+ off = TCP_OFF(sk);
if (skb_can_coalesce(skb, i, page, off) &&
off != PAGE_SIZE) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78dd38cd5496..0cbb44076cfa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5811,6 +5811,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
if (th->syn) {
+ if (th->fin)
+ goto discard;
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 58f69acd3d22..50788d67bdb7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2147,7 +2147,15 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ /* make sure skb->data is aligned on arches that require it */
+ if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
+ struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } else {
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
if (err == 0) {
/* Update global TCP statistics. */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 586051726341..058cc222b3f1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -657,7 +657,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
* layer address of our nexhop router
*/
- if (dst_get_neighbour_raw(&rt->dst) == NULL)
+ if (dst_get_neighbour_noref_raw(&rt->dst) == NULL)
ifa->flags &= ~IFA_F_OPTIMISTIC;
ifa->idev = idev;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 37f548b7f6dc..72957f4a7c6c 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -57,6 +57,9 @@ int ipv6_ext_hdr(u8 nexthdr)
* it returns NULL.
* - First fragment header is skipped, not-first ones
* are considered as unparsable.
+ * - Reports the offset field of the final fragment header so it is
+ * possible to tell whether this is a first fragment, later fragment,
+ * or not fragmented.
* - ESP is unparsable for now and considered like
* normal payload protocol.
* - Note also special handling of AUTH header. Thanks to IPsec wizards.
@@ -64,10 +67,13 @@ int ipv6_ext_hdr(u8 nexthdr)
* --ANK (980726)
*/
-int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
+ __be16 *frag_offp)
{
u8 nexthdr = *nexthdrp;
+ *frag_offp = 0;
+
while (ipv6_ext_hdr(nexthdr)) {
struct ipv6_opt_hdr _hdr, *hp;
int hdrlen;
@@ -87,7 +93,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
if (fp == NULL)
return -1;
- if (ntohs(*fp) & ~0x7)
+ *frag_offp = *fp;
+ if (ntohs(*frag_offp) & ~0x7)
break;
hdrlen = 8;
} else if (nexthdr == NEXTHDR_AUTH)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 9e2bdccf9143..01d46bff63c3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -135,11 +135,12 @@ static int is_ineligible(struct sk_buff *skb)
int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
int len = skb->len - ptr;
__u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
if (len < 0)
return 1;
- ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr);
+ ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off);
if (ptr < 0)
return 0;
if (nexthdr == IPPROTO_ICMPV6) {
@@ -596,6 +597,7 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
int inner_offset;
int hash;
u8 nexthdr;
+ __be16 frag_off;
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
return;
@@ -603,7 +605,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
if (ipv6_ext_hdr(nexthdr)) {
/* now skip over extension headers */
- inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
if (inner_offset<0)
return;
} else {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 424f063fb229..278363123657 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -190,7 +190,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
struct fib6_table *table;
table = kzalloc(sizeof(*table), GFP_ATOMIC);
- if (table != NULL) {
+ if (table) {
table->tb6_id = id;
table->tb6_root.leaf = net->ipv6.ip6_null_entry;
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
@@ -210,7 +210,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id)
return tb;
tb = fib6_alloc_table(net, id);
- if (tb != NULL)
+ if (tb)
fib6_link_table(net, tb);
return tb;
@@ -367,7 +367,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
s_e = cb->args[1];
w = (void *)cb->args[2];
- if (w == NULL) {
+ if (!w) {
/* New dump:
*
* 1. hook callback destructor.
@@ -379,7 +379,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
* 2. allocate and initialize walker.
*/
w = kzalloc(sizeof(*w), GFP_ATOMIC);
- if (w == NULL)
+ if (!w)
return -ENOMEM;
w->func = fib6_dump_node;
cb->args[2] = (long)w;
@@ -467,7 +467,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
if (plen == fn->fn_bit) {
/* clean up an intermediate node */
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
+ if (!(fn->fn_flags & RTN_RTINFO)) {
rt6_release(fn->leaf);
fn->leaf = NULL;
}
@@ -512,7 +512,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
ln = node_alloc();
- if (ln == NULL)
+ if (!ln)
return NULL;
ln->fn_bit = plen;
@@ -555,7 +555,7 @@ insert_above:
in = node_alloc();
ln = node_alloc();
- if (in == NULL || ln == NULL) {
+ if (!in || !ln) {
if (in)
node_free(in);
if (ln)
@@ -609,7 +609,7 @@ insert_above:
ln = node_alloc();
- if (ln == NULL)
+ if (!ln)
return NULL;
ln->fn_bit = plen;
@@ -642,15 +642,15 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
{
struct rt6_info *iter = NULL;
struct rt6_info **ins;
- int replace = (NULL != info->nlh &&
- (info->nlh->nlmsg_flags&NLM_F_REPLACE));
- int add = (NULL == info->nlh ||
- (info->nlh->nlmsg_flags&NLM_F_CREATE));
+ int replace = (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_REPLACE));
+ int add = (!info->nlh ||
+ (info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
ins = &fn->leaf;
- for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) {
+ for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
/*
* Search for duplicates
*/
@@ -659,8 +659,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
/*
* Same priority level
*/
- if (NULL != info->nlh &&
- (info->nlh->nlmsg_flags&NLM_F_EXCL))
+ if (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_EXCL))
return -EEXIST;
if (replace) {
found++;
@@ -671,10 +671,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
iter->rt6i_idev == rt->rt6i_idev &&
ipv6_addr_equal(&iter->rt6i_gateway,
&rt->rt6i_gateway)) {
- if (!(iter->rt6i_flags&RTF_EXPIRES))
+ if (!(iter->rt6i_flags & RTF_EXPIRES))
return -EEXIST;
iter->rt6i_expires = rt->rt6i_expires;
- if (!(rt->rt6i_flags&RTF_EXPIRES)) {
+ if (!(rt->rt6i_flags & RTF_EXPIRES)) {
iter->rt6i_flags &= ~RTF_EXPIRES;
iter->rt6i_expires = 0;
}
@@ -707,7 +707,7 @@ add:
inet6_rt_notify(RTM_NEWROUTE, rt, info);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
+ if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
@@ -725,7 +725,7 @@ add:
atomic_inc(&rt->rt6i_ref);
inet6_rt_notify(RTM_NEWROUTE, rt, info);
rt6_release(iter);
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
+ if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
@@ -737,7 +737,7 @@ add:
static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
- (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
+ (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
mod_timer(&net->ipv6.ip6_fib_timer,
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
@@ -761,25 +761,26 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
int err = -ENOMEM;
int allow_create = 1;
int replace_required = 0;
- if (NULL != info->nlh) {
- if (!(info->nlh->nlmsg_flags&NLM_F_CREATE))
+
+ if (info->nlh) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
- if ((info->nlh->nlmsg_flags&NLM_F_REPLACE))
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
replace_required = 1;
}
if (!allow_create && !replace_required)
pr_warn("IPv6: RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
- rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst),
- allow_create, replace_required);
+ rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst),
+ allow_create, replace_required);
if (IS_ERR(fn)) {
err = PTR_ERR(fn);
fn = NULL;
}
- if (fn == NULL)
+ if (!fn)
goto out;
pn = fn;
@@ -788,7 +789,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
if (rt->rt6i_src.plen) {
struct fib6_node *sn;
- if (fn->subtree == NULL) {
+ if (!fn->subtree) {
struct fib6_node *sfn;
/*
@@ -803,7 +804,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
/* Create subtree root node */
sfn = node_alloc();
- if (sfn == NULL)
+ if (!sfn)
goto st_failure;
sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
@@ -818,7 +819,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
offsetof(struct rt6_info, rt6i_src),
allow_create, replace_required);
- if (sn == NULL) {
+ if (!sn) {
/* If it is failed, discard just allocated
root, and then (in st_failure) stale node
in main tree.
@@ -840,11 +841,11 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
err = PTR_ERR(sn);
sn = NULL;
}
- if (sn == NULL)
+ if (!sn)
goto st_failure;
}
- if (fn->leaf == NULL) {
+ if (!fn->leaf) {
fn->leaf = rt;
atomic_inc(&rt->rt6i_ref);
}
@@ -853,10 +854,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
#endif
err = fib6_add_rt2node(fn, rt, info);
-
- if (err == 0) {
+ if (!err) {
fib6_start_gc(info->nl_net, rt);
- if (!(rt->rt6i_flags&RTF_CACHE))
+ if (!(rt->rt6i_flags & RTF_CACHE))
fib6_prune_clones(info->nl_net, pn, rt);
}
@@ -904,7 +904,7 @@ st_failure:
*/
struct lookup_args {
- int offset; /* key offset on rt6_info */
+ int offset; /* key offset on rt6_info */
const struct in6_addr *addr; /* search key */
};
@@ -934,11 +934,10 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
fn = next;
continue;
}
-
break;
}
- while(fn) {
+ while (fn) {
if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
struct rt6key *key;
@@ -985,8 +984,7 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *da
};
fn = fib6_lookup_1(root, daddr ? args : args + 1);
-
- if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
+ if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
return fn;
@@ -1046,7 +1044,7 @@ struct fib6_node * fib6_locate(struct fib6_node *root,
}
#endif
- if (fn && fn->fn_flags&RTN_RTINFO)
+ if (fn && fn->fn_flags & RTN_RTINFO)
return fn;
return NULL;
@@ -1060,14 +1058,13 @@ struct fib6_node * fib6_locate(struct fib6_node *root,
static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
{
- if (fn->fn_flags&RTN_ROOT)
+ if (fn->fn_flags & RTN_ROOT)
return net->ipv6.ip6_null_entry;
- while(fn) {
- if(fn->left)
+ while (fn) {
+ if (fn->left)
return fn->left->leaf;
-
- if(fn->right)
+ if (fn->right)
return fn->right->leaf;
fn = FIB6_SUBTREE(fn);
@@ -1105,12 +1102,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
/* Subtree root (i.e. fn) may have one child */
- || (children && fn->fn_flags&RTN_ROOT)
+ || (children && fn->fn_flags & RTN_ROOT)
#endif
) {
fn->leaf = fib6_find_prefix(net, fn);
#if RT6_DEBUG >= 2
- if (fn->leaf==NULL) {
+ if (!fn->leaf) {
WARN_ON(!fn->leaf);
fn->leaf = net->ipv6.ip6_null_entry;
}
@@ -1143,7 +1140,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
read_lock(&fib6_walker_lock);
FOR_WALKERS(w) {
- if (child == NULL) {
+ if (!child) {
if (w->root == fn) {
w->root = w->node = NULL;
RT6_TRACE("W %p adjusted by delroot 1\n", w);
@@ -1172,7 +1169,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
read_unlock(&fib6_walker_lock);
node_free(fn);
- if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn))
+ if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
return pn;
rt6_release(pn->leaf);
@@ -1206,7 +1203,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
w->leaf = rt->dst.rt6_next;
- if (w->leaf == NULL)
+ if (!w->leaf)
w->state = FWS_U;
}
}
@@ -1215,7 +1212,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
rt->dst.rt6_next = NULL;
/* If it was last route, expunge its radix tree node */
- if (fn->leaf == NULL) {
+ if (!fn->leaf) {
fn->fn_flags &= ~RTN_RTINFO;
net->ipv6.rt6_stats->fib_route_nodes--;
fn = fib6_repair_tree(net, fn);
@@ -1229,7 +1226,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
* to still alive ones.
*/
while (fn) {
- if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
+ if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
fn->leaf = fib6_find_prefix(net, fn);
atomic_inc(&fn->leaf->rt6i_ref);
rt6_release(rt);
@@ -1256,17 +1253,17 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
return -ENOENT;
}
#endif
- if (fn == NULL || rt == net->ipv6.ip6_null_entry)
+ if (!fn || rt == net->ipv6.ip6_null_entry)
return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
- if (!(rt->rt6i_flags&RTF_CACHE)) {
+ if (!(rt->rt6i_flags & RTF_CACHE)) {
struct fib6_node *pn = fn;
#ifdef CONFIG_IPV6_SUBTREES
/* clones of this route might be in another subtree */
if (rt->rt6i_src.plen) {
- while (!(pn->fn_flags&RTN_ROOT))
+ while (!(pn->fn_flags & RTN_ROOT))
pn = pn->parent;
pn = pn->parent;
}
@@ -1317,11 +1314,11 @@ static int fib6_walk_continue(struct fib6_walker_t *w)
for (;;) {
fn = w->node;
- if (fn == NULL)
+ if (!fn)
return 0;
if (w->prune && fn != w->root &&
- fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
+ fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
w->state = FWS_C;
w->leaf = fn->leaf;
}
@@ -1350,7 +1347,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w)
w->state = FWS_C;
w->leaf = fn->leaf;
case FWS_C:
- if (w->leaf && fn->fn_flags&RTN_RTINFO) {
+ if (w->leaf && fn->fn_flags & RTN_RTINFO) {
int err;
if (w->count < w->skip) {
@@ -1524,7 +1521,7 @@ static int fib6_age(struct rt6_info *rt, void *arg)
* only if they are not in use now.
*/
- if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
+ if (rt->rt6i_flags & RTF_EXPIRES && rt->rt6i_expires) {
if (time_after(now, rt->rt6i_expires)) {
RT6_TRACE("expiring %p\n", rt);
return -1;
@@ -1536,7 +1533,7 @@ static int fib6_age(struct rt6_info *rt, void *arg)
RT6_TRACE("aging clone %p\n", rt);
return -1;
} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
- (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) {
+ (!(dst_get_neighbour_noref_raw(&rt->dst)->flags & NTF_ROUTER))) {
RT6_TRACE("purging route %p via non-router but gateway\n",
rt);
return -1;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index a46c64eb0a66..1ca5d45a12e8 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -280,6 +280,7 @@ int ip6_mc_input(struct sk_buff *skb)
u8 *ptr = skb_network_header(skb) + opt->ra;
struct icmp6hdr *icmp6;
u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
int offset;
/* Check if the value of Router Alert
@@ -293,7 +294,7 @@ int ip6_mc_input(struct sk_buff *skb)
goto out;
}
offset = ipv6_skip_exthdr(skb, sizeof(*hdr),
- &nexthdr);
+ &nexthdr, &frag_off);
if (offset < 0)
goto out;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a24e15557843..71d26999c955 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -136,7 +136,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
}
rcu_read_lock();
- neigh = dst_get_neighbour(dst);
+ neigh = dst_get_neighbour_noref(dst);
if (neigh) {
int res = neigh_output(neigh, skb);
@@ -329,10 +329,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
{
struct ipv6hdr *hdr = ipv6_hdr(skb);
u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
int offset;
if (ipv6_ext_hdr(nexthdr)) {
- offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
+ offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
if (offset < 0)
return 0;
} else
@@ -462,7 +463,7 @@ int ip6_forward(struct sk_buff *skb)
send redirects to source routed frames.
We don't send redirects to frames decapsulated from IPsec.
*/
- n = dst_get_neighbour(dst);
+ n = dst_get_neighbour_noref(dst);
if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
struct in6_addr *target = NULL;
struct rt6_info *rt;
@@ -982,7 +983,7 @@ static int ip6_dst_lookup_tail(struct sock *sk,
* dst entry of the nexthop router
*/
rcu_read_lock();
- n = dst_get_neighbour(*dst);
+ n = dst_get_neighbour_noref(*dst);
if (n && !(n->nud_state & NUD_VALID)) {
struct inet6_ifaddr *ifp;
struct flowi6 fl_gw6;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index cfb9709ac7c9..e72c8af85781 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1238,7 +1238,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
if (rt)
- neigh = dst_get_neighbour(&rt->dst);
+ neigh = dst_get_neighbour_noref(&rt->dst);
if (rt && lifetime == 0) {
neigh_clone(neigh);
@@ -1258,7 +1258,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = dst_get_neighbour(&rt->dst);
+ neigh = dst_get_neighbour_noref(&rt->dst);
if (neigh == NULL) {
ND_PRINTK0(KERN_ERR
"ICMPv6 RA: %s() got default router without neighbour.\n",
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index b5a2aa58a03a..aad2fa41cf46 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -49,6 +49,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
const __u8 tclass = DEFAULT_TOS_VALUE;
struct dst_entry *dst = NULL;
u8 proto;
+ __be16 frag_off;
struct flowi6 fl6;
if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
@@ -58,7 +59,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
}
proto = oip6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
+ tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off);
if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
pr_debug("Cannot get TCP header.\n");
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0e381bb94683..09412baf1ca6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -247,9 +247,9 @@ static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
{
struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
- if (rt != NULL)
+ if (rt)
memset(&rt->rt6i_table, 0,
- sizeof(*rt) - sizeof(struct dst_entry));
+ sizeof(*rt) - sizeof(struct dst_entry));
return rt;
}
@@ -263,7 +263,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
if (!(rt->dst.flags & DST_HOST))
dst_destroy_metrics_generic(dst);
- if (idev != NULL) {
+ if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
@@ -299,10 +299,10 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
struct net_device *loopback_dev =
dev_net(dev)->loopback_dev;
- if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
+ if (dev != loopback_dev && idev && idev->dev == dev) {
struct inet6_dev *loopback_idev =
in6_dev_get(loopback_dev);
- if (loopback_idev != NULL) {
+ if (loopback_idev) {
rt->rt6i_idev = loopback_idev;
in6_dev_put(idev);
}
@@ -344,7 +344,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
if (dev->ifindex == oif)
return sprt;
if (dev->flags & IFF_LOOPBACK) {
- if (sprt->rt6i_idev == NULL ||
+ if (!sprt->rt6i_idev ||
sprt->rt6i_idev->dev->ifindex != oif) {
if (flags & RT6_LOOKUP_F_IFACE && oif)
continue;
@@ -385,7 +385,7 @@ static void rt6_probe(struct rt6_info *rt)
* to no more than one per minute.
*/
rcu_read_lock();
- neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
+ neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
if (!neigh || (neigh->nud_state & NUD_VALID))
goto out;
read_lock_bh(&neigh->lock);
@@ -432,7 +432,7 @@ static inline int rt6_check_neigh(struct rt6_info *rt)
int m;
rcu_read_lock();
- neigh = dst_get_neighbour(&rt->dst);
+ neigh = dst_get_neighbour_noref(&rt->dst);
if (rt->rt6i_flags & RTF_NONEXTHOP ||
!(rt->rt6i_flags & RTF_GATEWAY))
m = 1;
@@ -636,7 +636,7 @@ do { \
goto restart; \
} \
} \
-} while(0)
+} while (0)
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
@@ -727,7 +727,7 @@ static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
struct neighbour *neigh;
int attempts = !in_softirq();
- if (!(rt->rt6i_flags&RTF_GATEWAY)) {
+ if (!(rt->rt6i_flags & RTF_GATEWAY)) {
if (rt->rt6i_dst.plen != 128 &&
ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
@@ -744,7 +744,8 @@ static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
#endif
retry:
- neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+ neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
+ rt->rt6i_dev);
if (IS_ERR(neigh)) {
struct net *net = dev_net(rt->rt6i_dev);
int saved_rt_min_interval =
@@ -785,7 +786,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
if (rt) {
rt->rt6i_flags |= RTF_CACHE;
- dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
+ dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
}
return rt;
}
@@ -819,7 +820,7 @@ restart:
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
- if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
+ if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
else if (!(rt->dst.flags & DST_HOST))
nrt = rt6_alloc_clone(rt, &fl6->daddr);
@@ -875,7 +876,7 @@ void ip6_route_input(struct sk_buff *skb)
.flowi6_iif = skb->dev->ifindex,
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
+ .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
.flowi6_mark = skb->mark,
.flowi6_proto = iph->nexthdr,
};
@@ -997,7 +998,7 @@ static void ip6_link_failure(struct sk_buff *skb)
rt = (struct rt6_info *) skb_dst(skb);
if (rt) {
- if (rt->rt6i_flags&RTF_CACHE) {
+ if (rt->rt6i_flags & RTF_CACHE) {
dst_set_expires(&rt->dst, 0);
rt->rt6i_flags |= RTF_EXPIRES;
} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
@@ -1073,11 +1074,11 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
struct inet6_dev *idev = in6_dev_get(dev);
struct net *net = dev_net(dev);
- if (unlikely(idev == NULL))
+ if (unlikely(!idev))
return NULL;
rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
- if (unlikely(rt == NULL)) {
+ if (unlikely(!rt)) {
in6_dev_put(idev);
goto out;
}
@@ -1085,7 +1086,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
if (neigh)
neigh_hold(neigh);
else {
- neigh = ndisc_get_neigh(dev, addr);
+ neigh = __neigh_lookup_errno(&nd_tbl, addr, dev);
if (IS_ERR(neigh))
neigh = NULL;
}
@@ -1238,23 +1239,23 @@ int ip6_route_add(struct fib6_config *cfg)
cfg->fc_metric = IP6_RT_PRIO_USER;
err = -ENOBUFS;
- if (NULL != cfg->fc_nlinfo.nlh &&
- !(cfg->fc_nlinfo.nlh->nlmsg_flags&NLM_F_CREATE)) {
+ if (cfg->fc_nlinfo.nlh &&
+ !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
table = fib6_get_table(net, cfg->fc_table);
- if (table == NULL) {
+ if (!table) {
printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
table = fib6_new_table(net, cfg->fc_table);
}
} else {
table = fib6_new_table(net, cfg->fc_table);
}
- if (table == NULL) {
+
+ if (!table)
goto out;
- }
rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
- if (rt == NULL) {
+ if (!rt) {
err = -ENOMEM;
goto out;
}
@@ -1303,8 +1304,9 @@ int ip6_route_add(struct fib6_config *cfg)
they would result in kernel looping; promote them to reject routes
*/
if ((cfg->fc_flags & RTF_REJECT) ||
- (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
- && !(cfg->fc_flags&RTF_LOCAL))) {
+ (dev && (dev->flags & IFF_LOOPBACK) &&
+ !(addr_type & IPV6_ADDR_LOOPBACK) &&
+ !(cfg->fc_flags & RTF_LOCAL))) {
/* hold loopback dev/idev if we haven't done so. */
if (dev != net->loopback_dev) {
if (dev) {
@@ -1345,13 +1347,13 @@ int ip6_route_add(struct fib6_config *cfg)
some exceptions. --ANK
*/
err = -EINVAL;
- if (!(gwa_type&IPV6_ADDR_UNICAST))
+ if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
err = -EHOSTUNREACH;
- if (grt == NULL)
+ if (!grt)
goto out;
if (dev) {
if (dev != grt->rt6i_dev) {
@@ -1364,7 +1366,7 @@ int ip6_route_add(struct fib6_config *cfg)
dev_hold(dev);
in6_dev_hold(grt->rt6i_idev);
}
- if (!(grt->rt6i_flags&RTF_GATEWAY))
+ if (!(grt->rt6i_flags & RTF_GATEWAY))
err = 0;
dst_release(&grt->dst);
@@ -1372,12 +1374,12 @@ int ip6_route_add(struct fib6_config *cfg)
goto out;
}
err = -EINVAL;
- if (dev == NULL || (dev->flags&IFF_LOOPBACK))
+ if (!dev || (dev->flags & IFF_LOOPBACK))
goto out;
}
err = -ENODEV;
- if (dev == NULL)
+ if (!dev)
goto out;
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
@@ -1474,7 +1476,7 @@ static int ip6_route_del(struct fib6_config *cfg)
int err = -ESRCH;
table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
- if (table == NULL)
+ if (!table)
return err;
read_lock_bh(&table->tb6_lock);
@@ -1486,7 +1488,7 @@ static int ip6_route_del(struct fib6_config *cfg)
if (fn) {
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
if (cfg->fc_ifindex &&
- (rt->rt6i_dev == NULL ||
+ (!rt->rt6i_dev ||
rt->rt6i_dev->ifindex != cfg->fc_ifindex))
continue;
if (cfg->fc_flags & RTF_GATEWAY &&
@@ -1627,11 +1629,11 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
dst_confirm(&rt->dst);
/* Duplicate redirect: silently ignore. */
- if (neigh == dst_get_neighbour_raw(&rt->dst))
+ if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
goto out;
nrt = ip6_rt_copy(rt, dest);
- if (nrt == NULL)
+ if (!nrt)
goto out;
nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
@@ -1648,7 +1650,7 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
netevent.new = &nrt->dst;
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
- if (rt->rt6i_flags&RTF_CACHE) {
+ if (rt->rt6i_flags & RTF_CACHE) {
ip6_del_rt(rt);
return;
}
@@ -1669,7 +1671,7 @@ static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr
int allfrag = 0;
again:
rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
- if (rt == NULL)
+ if (!rt)
return;
if (rt6_check_expired(rt)) {
@@ -1719,7 +1721,7 @@ again:
1. It is connected route. Action: COW
2. It is gatewayed route or NONEXTHOP route. Action: clone it.
*/
- if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
+ if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
nrt = rt6_alloc_cow(rt, daddr, saddr);
else
nrt = rt6_alloc_clone(rt, daddr);
@@ -1817,7 +1819,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
struct fib6_table *table;
table = fib6_get_table(net, RT6_TABLE_INFO);
- if (table == NULL)
+ if (!table)
return NULL;
write_lock_bh(&table->tb6_lock);
@@ -1876,7 +1878,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
struct fib6_table *table;
table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
- if (table == NULL)
+ if (!table)
return NULL;
write_lock_bh(&table->tb6_lock);
@@ -1921,7 +1923,7 @@ void rt6_purge_dflt_routers(struct net *net)
/* NOTE: Keep consistent with rt6_get_dflt_router */
table = fib6_get_table(net, RT6_TABLE_DFLT);
- if (table == NULL)
+ if (!table)
return;
restart:
@@ -2061,7 +2063,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
net->loopback_dev, 0);
struct neighbour *neigh;
- if (rt == NULL) {
+ if (!rt) {
if (net_ratelimit())
pr_warning("IPv6: Maximum number of routes reached,"
" consider increasing route/max_size.\n");
@@ -2081,7 +2083,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
rt->rt6i_flags |= RTF_ANYCAST;
else
rt->rt6i_flags |= RTF_LOCAL;
- neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+ neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
if (IS_ERR(neigh)) {
dst_free(&rt->dst);
@@ -2127,7 +2129,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
+ if (((void *)rt->rt6i_dev == dev || !dev) &&
rt != net->ipv6.ip6_null_entry &&
ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
/* remove prefsrc entry */
@@ -2157,7 +2159,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
const struct arg_dev_net *adn = arg;
const struct net_device *dev = adn->dev;
- if ((rt->rt6i_dev == dev || dev == NULL) &&
+ if ((rt->rt6i_dev == dev || !dev) &&
rt != adn->net->ipv6.ip6_null_entry) {
RT6_TRACE("deleted by ifdown %p\n", rt);
return -1;
@@ -2194,7 +2196,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
*/
idev = __in6_dev_get(arg->dev);
- if (idev == NULL)
+ if (!idev)
return 0;
/* For administrative MTU increase, there is no way to discover
@@ -2374,7 +2376,7 @@ static int rt6_fill_node(struct net *net,
}
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
- if (nlh == NULL)
+ if (!nlh)
return -EMSGSIZE;
rtm = nlmsg_data(nlh);
@@ -2388,25 +2390,25 @@ static int rt6_fill_node(struct net *net,
table = RT6_TABLE_UNSPEC;
rtm->rtm_table = table;
NLA_PUT_U32(skb, RTA_TABLE, table);
- if (rt->rt6i_flags&RTF_REJECT)
+ if (rt->rt6i_flags & RTF_REJECT)
rtm->rtm_type = RTN_UNREACHABLE;
- else if (rt->rt6i_flags&RTF_LOCAL)
+ else if (rt->rt6i_flags & RTF_LOCAL)
rtm->rtm_type = RTN_LOCAL;
- else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
+ else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
rtm->rtm_type = RTN_LOCAL;
else
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->rt6i_protocol;
- if (rt->rt6i_flags&RTF_DYNAMIC)
+ if (rt->rt6i_flags & RTF_DYNAMIC)
rtm->rtm_protocol = RTPROT_REDIRECT;
else if (rt->rt6i_flags & RTF_ADDRCONF)
rtm->rtm_protocol = RTPROT_KERNEL;
- else if (rt->rt6i_flags&RTF_DEFAULT)
+ else if (rt->rt6i_flags & RTF_DEFAULT)
rtm->rtm_protocol = RTPROT_RA;
- if (rt->rt6i_flags&RTF_CACHE)
+ if (rt->rt6i_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
if (dst) {
@@ -2454,7 +2456,7 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
rcu_read_lock();
- n = dst_get_neighbour(&rt->dst);
+ n = dst_get_neighbour_noref(&rt->dst);
if (n)
NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
rcu_read_unlock();
@@ -2546,7 +2548,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
}
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (skb == NULL) {
+ if (!skb) {
err = -ENOBUFS;
goto errout;
}
@@ -2581,10 +2583,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
int err;
err = -ENOBUFS;
- seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
+ seq = info->nlh ? info->nlh->nlmsg_seq : 0;
skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
- if (skb == NULL)
+ if (!skb)
goto errout;
err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
@@ -2651,7 +2653,7 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg)
seq_puts(m, "00000000000000000000000000000000 00 ");
#endif
rcu_read_lock();
- n = dst_get_neighbour(&rt->dst);
+ n = dst_get_neighbour_noref(&rt->dst);
if (n) {
seq_printf(m, "%pi6", n->primary_key);
} else {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 50968f226e75..b7d14cc12ee8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -680,7 +680,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct neighbour *neigh = NULL;
if (skb_dst(skb))
- neigh = dst_get_neighbour(skb_dst(skb));
+ neigh = dst_get_neighbour_noref(skb_dst(skb));
if (neigh == NULL) {
if (net_ratelimit())
@@ -705,7 +705,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct neighbour *neigh = NULL;
if (skb_dst(skb))
- neigh = dst_get_neighbour(skb_dst(skb));
+ neigh = dst_get_neighbour_noref(skb_dst(skb));
if (neigh == NULL) {
if (net_ratelimit())
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 052579fe389a..b71a6e7ab0a5 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -116,9 +116,11 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
{
int protoff;
u8 nexthdr;
+ __be16 frag_off;
nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
if (protoff < 0)
return false;
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 4bca15a0c385..ba92824086f3 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -98,6 +98,7 @@ static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
struct ipv6hdr _ip6h;
const struct ipv6hdr *ih;
u8 nexthdr;
+ __be16 frag_off;
int offset;
ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
@@ -108,7 +109,7 @@ static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
nexthdr = ih->nexthdr;
offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
- &nexthdr);
+ &nexthdr, &frag_off);
audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
&ih->saddr, &ih->daddr, nexthdr);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 3ecade3966d5..ba722621ed25 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -204,11 +204,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u8 nexthdr;
+ __be16 frag_off;
int tcphoff;
int ret;
nexthdr = ipv6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
if (tcphoff < 0)
return NF_DROP;
ret = tcpmss_mangle_packet(skb, par->targinfo,
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 9dc9ecfdd546..3a295cc734bd 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -87,9 +87,10 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
int tcphoff;
u_int8_t nexthdr;
+ __be16 frag_off;
nexthdr = ipv6h->nexthdr;
- tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr);
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
if (tcphoff < 0)
return NF_DROP;
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index dfd52bad1523..068698f64791 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -445,6 +445,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
{
__be16 _ports[2], *ports;
u8 nexthdr;
+ __be16 frag_off;
int poff;
memset(dst, 0, sizeof(*dst));
@@ -480,7 +481,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
(XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
return 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
if ((int)protoff < 0)
return -1;
break;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index fe39f7e913df..c302e30dc50c 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -214,6 +214,7 @@ extract_icmp6_fields(const struct sk_buff *skb,
struct icmp6hdr *icmph, _icmph;
__be16 *ports, _ports[2];
u8 inside_nexthdr;
+ __be16 inside_fragoff;
int inside_hdrlen;
icmph = skb_header_pointer(skb, outside_hdrlen,
@@ -229,7 +230,8 @@ extract_icmp6_fields(const struct sk_buff *skb,
return 1;
inside_nexthdr = inside_iph->nexthdr;
- inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph),
+ &inside_nexthdr, &inside_fragoff);
if (inside_hdrlen < 0)
return 1; /* hjm: Packet has no/incomplete transport layer headers. */
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 482fa571b4ee..28453ae2a97b 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -33,6 +33,14 @@ void genl_unlock(void)
}
EXPORT_SYMBOL(genl_unlock);
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_genl_is_held(void)
+{
+ return lockdep_is_held(&genl_mutex);
+}
+EXPORT_SYMBOL(lockdep_genl_is_held);
+#endif
+
#define GENL_FAM_TAB_SIZE 16
#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
@@ -946,3 +954,16 @@ int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group,
return genlmsg_mcast(skb, pid, group, flags);
}
EXPORT_SYMBOL(genlmsg_multicast_allns);
+
+void genl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
+{
+ struct sock *sk = net->genl_sock;
+ int report = 0;
+
+ if (nlh)
+ report = nlmsg_report(nlh);
+
+ nlmsg_notify(sk, skb, pid, group, report, flags);
+}
+EXPORT_SYMBOL(genl_notify);
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
new file mode 100644
index 000000000000..d9ea33c361be
--- /dev/null
+++ b/net/openvswitch/Kconfig
@@ -0,0 +1,28 @@
+#
+# Open vSwitch
+#
+
+config OPENVSWITCH
+ tristate "Open vSwitch"
+ ---help---
+ Open vSwitch is a multilayer Ethernet switch targeted at virtualized
+ environments. In addition to supporting a variety of features
+ expected in a traditional hardware switch, it enables fine-grained
+ programmatic extension and flow-based control of the network. This
+ control is useful in a wide variety of applications but is
+ particularly important in multi-server virtualization deployments,
+ which are often characterized by highly dynamic endpoints and the
+ need to maintain logical abstractions for multiple tenants.
+
+ The Open vSwitch datapath provides an in-kernel fast path for packet
+ forwarding. It is complemented by a userspace daemon, ovs-vswitchd,
+ which is able to accept configuration from a variety of sources and
+ translate it into packet processing rules.
+
+ See http://openvswitch.org for more information and userspace
+ utilities.
+
+ To compile this code as a module, choose M here: the module will be
+ called openvswitch.
+
+ If unsure, say N.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
new file mode 100644
index 000000000000..15e7384745c1
--- /dev/null
+++ b/net/openvswitch/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Open vSwitch.
+#
+
+obj-$(CONFIG_OPENVSWITCH) += openvswitch.o
+
+openvswitch-y := \
+ actions.o \
+ datapath.o \
+ dp_notify.o \
+ flow.o \
+ vport.o \
+ vport-internal_dev.o \
+ vport-netdev.o \
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
new file mode 100644
index 000000000000..2725d1bdf291
--- /dev/null
+++ b/net/openvswitch/actions.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ const struct nlattr *attr, int len, bool keep_skb);
+
+static int make_writable(struct sk_buff *skb, int write_len)
+{
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
+/* remove VLAN header from packet and update csum accrodingly. */
+static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
+{
+ struct vlan_hdr *vhdr;
+ int err;
+
+ err = make_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_sub(skb->csum, csum_partial(skb->data
+ + ETH_HLEN, VLAN_HLEN, 0));
+
+ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+ *current_tci = vhdr->h_vlan_TCI;
+
+ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+ __skb_pull(skb, VLAN_HLEN);
+
+ vlan_set_encap_proto(skb, vhdr);
+ skb->mac_header += VLAN_HLEN;
+ skb_reset_mac_len(skb);
+
+ return 0;
+}
+
+static int pop_vlan(struct sk_buff *skb)
+{
+ __be16 tci;
+ int err;
+
+ if (likely(vlan_tx_tag_present(skb))) {
+ skb->vlan_tci = 0;
+ } else {
+ if (unlikely(skb->protocol != htons(ETH_P_8021Q) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ err = __pop_vlan_tci(skb, &tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely(skb->protocol != htons(ETH_P_8021Q) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ err = __pop_vlan_tci(skb, &tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, ntohs(tci));
+ return 0;
+}
+
+static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan)
+{
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ u16 current_tag;
+
+ /* push down current VLAN tag */
+ current_tag = vlan_tx_tag_get(skb);
+
+ if (!__vlan_put_tag(skb, current_tag))
+ return -ENOMEM;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(skb->data
+ + ETH_HLEN, VLAN_HLEN, 0));
+
+ }
+ __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+ return 0;
+}
+
+static int set_eth_addr(struct sk_buff *skb,
+ const struct ovs_key_ethernet *eth_key)
+{
+ int err;
+ err = make_writable(skb, ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN);
+ memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN);
+
+ return 0;
+}
+
+static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
+ __be32 *addr, __be32 new_addr)
+{
+ int transport_len = skb->len - skb_transport_offset(skb);
+
+ if (nh->protocol == IPPROTO_TCP) {
+ if (likely(transport_len >= sizeof(struct tcphdr)))
+ inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
+ *addr, new_addr, 1);
+ } else if (nh->protocol == IPPROTO_UDP) {
+ if (likely(transport_len >= sizeof(struct udphdr)))
+ inet_proto_csum_replace4(&udp_hdr(skb)->check, skb,
+ *addr, new_addr, 1);
+ }
+
+ csum_replace4(&nh->check, *addr, new_addr);
+ skb->rxhash = 0;
+ *addr = new_addr;
+}
+
+static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
+{
+ csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
+ nh->ttl = new_ttl;
+}
+
+static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
+{
+ struct iphdr *nh;
+ int err;
+
+ err = make_writable(skb, skb_network_offset(skb) +
+ sizeof(struct iphdr));
+ if (unlikely(err))
+ return err;
+
+ nh = ip_hdr(skb);
+
+ if (ipv4_key->ipv4_src != nh->saddr)
+ set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src);
+
+ if (ipv4_key->ipv4_dst != nh->daddr)
+ set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst);
+
+ if (ipv4_key->ipv4_tos != nh->tos)
+ ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos);
+
+ if (ipv4_key->ipv4_ttl != nh->ttl)
+ set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl);
+
+ return 0;
+}
+
+/* Must follow make_writable() since that can move the skb data. */
+static void set_tp_port(struct sk_buff *skb, __be16 *port,
+ __be16 new_port, __sum16 *check)
+{
+ inet_proto_csum_replace2(check, skb, *port, new_port, 0);
+ *port = new_port;
+ skb->rxhash = 0;
+}
+
+static int set_udp_port(struct sk_buff *skb,
+ const struct ovs_key_udp *udp_port_key)
+{
+ struct udphdr *uh;
+ int err;
+
+ err = make_writable(skb, skb_transport_offset(skb) +
+ sizeof(struct udphdr));
+ if (unlikely(err))
+ return err;
+
+ uh = udp_hdr(skb);
+ if (udp_port_key->udp_src != uh->source)
+ set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check);
+
+ if (udp_port_key->udp_dst != uh->dest)
+ set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check);
+
+ return 0;
+}
+
+static int set_tcp_port(struct sk_buff *skb,
+ const struct ovs_key_tcp *tcp_port_key)
+{
+ struct tcphdr *th;
+ int err;
+
+ err = make_writable(skb, skb_transport_offset(skb) +
+ sizeof(struct tcphdr));
+ if (unlikely(err))
+ return err;
+
+ th = tcp_hdr(skb);
+ if (tcp_port_key->tcp_src != th->source)
+ set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check);
+
+ if (tcp_port_key->tcp_dst != th->dest)
+ set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check);
+
+ return 0;
+}
+
+static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+{
+ struct vport *vport;
+
+ if (unlikely(!skb))
+ return -ENOMEM;
+
+ vport = rcu_dereference(dp->ports[out_port]);
+ if (unlikely(!vport)) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ ovs_vport_send(vport, skb);
+ return 0;
+}
+
+static int output_userspace(struct datapath *dp, struct sk_buff *skb,
+ const struct nlattr *attr)
+{
+ struct dp_upcall_info upcall;
+ const struct nlattr *a;
+ int rem;
+
+ upcall.cmd = OVS_PACKET_CMD_ACTION;
+ upcall.key = &OVS_CB(skb)->flow->key;
+ upcall.userdata = NULL;
+ upcall.pid = 0;
+
+ for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+ a = nla_next(a, &rem)) {
+ switch (nla_type(a)) {
+ case OVS_USERSPACE_ATTR_USERDATA:
+ upcall.userdata = a;
+ break;
+
+ case OVS_USERSPACE_ATTR_PID:
+ upcall.pid = nla_get_u32(a);
+ break;
+ }
+ }
+
+ return ovs_dp_upcall(dp, skb, &upcall);
+}
+
+static int sample(struct datapath *dp, struct sk_buff *skb,
+ const struct nlattr *attr)
+{
+ const struct nlattr *acts_list = NULL;
+ const struct nlattr *a;
+ int rem;
+
+ for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+ a = nla_next(a, &rem)) {
+ switch (nla_type(a)) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (net_random() >= nla_get_u32(a))
+ return 0;
+ break;
+
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ acts_list = a;
+ break;
+ }
+ }
+
+ return do_execute_actions(dp, skb, nla_data(acts_list),
+ nla_len(acts_list), true);
+}
+
+static int execute_set_action(struct sk_buff *skb,
+ const struct nlattr *nested_attr)
+{
+ int err = 0;
+
+ switch (nla_type(nested_attr)) {
+ case OVS_KEY_ATTR_PRIORITY:
+ skb->priority = nla_get_u32(nested_attr);
+ break;
+
+ case OVS_KEY_ATTR_ETHERNET:
+ err = set_eth_addr(skb, nla_data(nested_attr));
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ err = set_ipv4(skb, nla_data(nested_attr));
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ err = set_tcp_port(skb, nla_data(nested_attr));
+ break;
+
+ case OVS_KEY_ATTR_UDP:
+ err = set_udp_port(skb, nla_data(nested_attr));
+ break;
+ }
+
+ return err;
+}
+
+/* Execute a list of actions against 'skb'. */
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ const struct nlattr *attr, int len, bool keep_skb)
+{
+ /* Every output action needs a separate clone of 'skb', but the common
+ * case is just a single output action, so that doing a clone and
+ * then freeing the original skbuff is wasteful. So the following code
+ * is slightly obscure just to avoid that. */
+ int prev_port = -1;
+ const struct nlattr *a;
+ int rem;
+
+ for (a = attr, rem = len; rem > 0;
+ a = nla_next(a, &rem)) {
+ int err = 0;
+
+ if (prev_port != -1) {
+ do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
+ prev_port = -1;
+ }
+
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT:
+ prev_port = nla_get_u32(a);
+ break;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ output_userspace(dp, skb, a);
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ err = push_vlan(skb, nla_data(a));
+ if (unlikely(err)) /* skb already freed. */
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ err = pop_vlan(skb);
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = execute_set_action(skb, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample(dp, skb, a);
+ break;
+ }
+
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+
+ if (prev_port != -1) {
+ if (keep_skb)
+ skb = skb_clone(skb, GFP_ATOMIC);
+
+ do_output(dp, skb, prev_port);
+ } else if (!keep_skb)
+ consume_skb(skb);
+
+ return 0;
+}
+
+/* Execute a list of actions against 'skb'. */
+int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
+{
+ struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
+
+ return do_execute_actions(dp, skb, acts->actions,
+ acts->actions_len, false);
+}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
new file mode 100644
index 000000000000..9a2725114e99
--- /dev/null
+++ b/net/openvswitch/datapath.c
@@ -0,0 +1,1912 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/etherdevice.h>
+#include <linux/genetlink.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/version.h>
+#include <linux/ethtool.h>
+#include <linux/wait.h>
+#include <asm/system.h>
+#include <asm/div64.h>
+#include <linux/highmem.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/inetdevice.h>
+#include <linux/list.h>
+#include <linux/openvswitch.h>
+#include <linux/rculist.h>
+#include <linux/dmi.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+
+#include "datapath.h"
+#include "flow.h"
+#include "vport-internal_dev.h"
+
+/**
+ * DOC: Locking:
+ *
+ * Writes to device state (add/remove datapath, port, set operations on vports,
+ * etc.) are protected by RTNL.
+ *
+ * Writes to other state (flow table modifications, set miscellaneous datapath
+ * parameters, etc.) are protected by genl_mutex. The RTNL lock nests inside
+ * genl_mutex.
+ *
+ * Reads are protected by RCU.
+ *
+ * There are a few special cases (mostly stats) that have their own
+ * synchronization but they nest under all of above and don't interact with
+ * each other.
+ */
+
+/* Global list of datapaths to enable dumping them all out.
+ * Protected by genl_mutex.
+ */
+static LIST_HEAD(dps);
+
+#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
+static void rehash_flow_table(struct work_struct *work);
+static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
+
+static struct vport *new_vport(const struct vport_parms *);
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+ const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
+ const struct dp_upcall_info *);
+
+/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
+static struct datapath *get_dp(int dp_ifindex)
+{
+ struct datapath *dp = NULL;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
+ if (dev) {
+ struct vport *vport = ovs_internal_dev_get_vport(dev);
+ if (vport)
+ dp = vport->dp;
+ }
+ rcu_read_unlock();
+
+ return dp;
+}
+
+/* Must be called with rcu_read_lock or RTNL lock. */
+const char *ovs_dp_name(const struct datapath *dp)
+{
+ struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
+ return vport->ops->get_name(vport);
+}
+
+static int get_dpifindex(struct datapath *dp)
+{
+ struct vport *local;
+ int ifindex;
+
+ rcu_read_lock();
+
+ local = rcu_dereference(dp->ports[OVSP_LOCAL]);
+ if (local)
+ ifindex = local->ops->get_ifindex(local);
+ else
+ ifindex = 0;
+
+ rcu_read_unlock();
+
+ return ifindex;
+}
+
+static void destroy_dp_rcu(struct rcu_head *rcu)
+{
+ struct datapath *dp = container_of(rcu, struct datapath, rcu);
+
+ ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
+ free_percpu(dp->stats_percpu);
+ kfree(dp);
+}
+
+/* Called with RTNL lock and genl_lock. */
+static struct vport *new_vport(const struct vport_parms *parms)
+{
+ struct vport *vport;
+
+ vport = ovs_vport_add(parms);
+ if (!IS_ERR(vport)) {
+ struct datapath *dp = parms->dp;
+
+ rcu_assign_pointer(dp->ports[parms->port_no], vport);
+ list_add(&vport->node, &dp->port_list);
+ }
+
+ return vport;
+}
+
+/* Called with RTNL lock. */
+void ovs_dp_detach_port(struct vport *p)
+{
+ ASSERT_RTNL();
+
+ /* First drop references to device. */
+ list_del(&p->node);
+ rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
+
+ /* Then destroy it. */
+ ovs_vport_del(p);
+}
+
+/* Must be called with rcu_read_lock. */
+void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
+{
+ struct datapath *dp = p->dp;
+ struct sw_flow *flow;
+ struct dp_stats_percpu *stats;
+ struct sw_flow_key key;
+ u64 *stats_counter;
+ int error;
+ int key_len;
+
+ stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+
+ /* Extract flow from 'skb' into 'key'. */
+ error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+ if (unlikely(error)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Look up flow. */
+ flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+ if (unlikely(!flow)) {
+ struct dp_upcall_info upcall;
+
+ upcall.cmd = OVS_PACKET_CMD_MISS;
+ upcall.key = &key;
+ upcall.userdata = NULL;
+ upcall.pid = p->upcall_pid;
+ ovs_dp_upcall(dp, skb, &upcall);
+ consume_skb(skb);
+ stats_counter = &stats->n_missed;
+ goto out;
+ }
+
+ OVS_CB(skb)->flow = flow;
+
+ stats_counter = &stats->n_hit;
+ ovs_flow_used(OVS_CB(skb)->flow, skb);
+ ovs_execute_actions(dp, skb);
+
+out:
+ /* Update datapath statistics. */
+ u64_stats_update_begin(&stats->sync);
+ (*stats_counter)++;
+ u64_stats_update_end(&stats->sync);
+}
+
+static struct genl_family dp_packet_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_PACKET_FAMILY,
+ .version = OVS_PACKET_VERSION,
+ .maxattr = OVS_PACKET_ATTR_MAX
+};
+
+int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct dp_stats_percpu *stats;
+ int dp_ifindex;
+ int err;
+
+ if (upcall_info->pid == 0) {
+ err = -ENOTCONN;
+ goto err;
+ }
+
+ dp_ifindex = get_dpifindex(dp);
+ if (!dp_ifindex) {
+ err = -ENODEV;
+ goto err;
+ }
+
+ if (!skb_is_gso(skb))
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ else
+ err = queue_gso_packets(dp_ifindex, skb, upcall_info);
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+
+ u64_stats_update_begin(&stats->sync);
+ stats->n_lost++;
+ u64_stats_update_end(&stats->sync);
+
+ return err;
+}
+
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct dp_upcall_info later_info;
+ struct sw_flow_key later_key;
+ struct sk_buff *segs, *nskb;
+ int err;
+
+ segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Queue all of the segments. */
+ skb = segs;
+ do {
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ if (err)
+ break;
+
+ if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+ /* The initial flow key extracted by ovs_flow_extract()
+ * in this case is for a first fragment, so we need to
+ * properly mark later fragments.
+ */
+ later_key = *upcall_info->key;
+ later_key.ip.frag = OVS_FRAG_TYPE_LATER;
+
+ later_info = *upcall_info;
+ later_info.key = &later_key;
+ upcall_info = &later_info;
+ }
+ } while ((skb = skb->next));
+
+ /* Free all of the segments. */
+ skb = segs;
+ do {
+ nskb = skb->next;
+ if (err)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ } while ((skb = nskb));
+ return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct ovs_header *upcall;
+ struct sk_buff *nskb = NULL;
+ struct sk_buff *user_skb; /* to be queued to userspace */
+ struct nlattr *nla;
+ unsigned int len;
+ int err;
+
+ if (vlan_tx_tag_present(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
+ if (!skb)
+ return -ENOMEM;
+
+ nskb->vlan_tci = 0;
+ skb = nskb;
+ }
+
+ if (nla_attr_size(skb->len) > USHRT_MAX) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ len = sizeof(struct ovs_header);
+ len += nla_total_size(skb->len);
+ len += nla_total_size(FLOW_BUFSIZE);
+ if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
+ len += nla_total_size(8);
+
+ user_skb = genlmsg_new(len, GFP_ATOMIC);
+ if (!user_skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
+ 0, upcall_info->cmd);
+ upcall->dp_ifindex = dp_ifindex;
+
+ nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
+ ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+ nla_nest_end(user_skb, nla);
+
+ if (upcall_info->userdata)
+ nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
+ nla_get_u64(upcall_info->userdata));
+
+ nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+
+ skb_copy_and_csum_dev(skb, nla_data(nla));
+
+ err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
+
+out:
+ kfree_skb(nskb);
+ return err;
+}
+
+/* Called with genl_mutex. */
+static int flush_flows(int dp_ifindex)
+{
+ struct flow_table *old_table;
+ struct flow_table *new_table;
+ struct datapath *dp;
+
+ dp = get_dp(dp_ifindex);
+ if (!dp)
+ return -ENODEV;
+
+ old_table = genl_dereference(dp->table);
+ new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
+ if (!new_table)
+ return -ENOMEM;
+
+ rcu_assign_pointer(dp->table, new_table);
+
+ ovs_flow_tbl_deferred_destroy(old_table);
+ return 0;
+}
+
+static int validate_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth);
+
+static int validate_sample(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth)
+{
+ const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+ const struct nlattr *probability, *actions;
+ const struct nlattr *a;
+ int rem;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+ if (!probability || nla_len(probability) != sizeof(u32))
+ return -EINVAL;
+
+ actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+ return validate_actions(actions, key, depth + 1);
+}
+
+static int validate_set(const struct nlattr *a,
+ const struct sw_flow_key *flow_key)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+
+ /* There can be only one key in a action */
+ if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ return -EINVAL;
+
+ if (key_type > OVS_KEY_ATTR_MAX ||
+ nla_len(ovs_key) != ovs_key_lens[key_type])
+ return -EINVAL;
+
+ switch (key_type) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ case OVS_KEY_ATTR_PRIORITY:
+ case OVS_KEY_ATTR_ETHERNET:
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ if (flow_key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
+ return -EINVAL;
+
+ ipv4_key = nla_data(ovs_key);
+ if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ if (flow_key->ip.proto != IPPROTO_TCP)
+ return -EINVAL;
+
+ if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_UDP:
+ if (flow_key->ip.proto != IPPROTO_UDP)
+ return -EINVAL;
+
+ if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+ static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+ [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
+ };
+ struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+ int error;
+
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+ attr, userspace_policy);
+ if (error)
+ return error;
+
+ if (!a[OVS_USERSPACE_ATTR_PID] ||
+ !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int validate_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ if (depth >= SAMPLE_ACTION_DEPTH)
+ return -EOVERFLOW;
+
+ nla_for_each_nested(a, attr, rem) {
+ /* Expected argument lengths, (u32)-1 for variable length. */
+ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+ [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+ [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+ [OVS_ACTION_ATTR_POP_VLAN] = 0,
+ [OVS_ACTION_ATTR_SET] = (u32)-1,
+ [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+ };
+ const struct ovs_action_push_vlan *vlan;
+ int type = nla_type(a);
+
+ if (type > OVS_ACTION_ATTR_MAX ||
+ (action_lens[type] != nla_len(a) &&
+ action_lens[type] != (u32)-1))
+ return -EINVAL;
+
+ switch (type) {
+ case OVS_ACTION_ATTR_UNSPEC:
+ return -EINVAL;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ err = validate_userspace(a);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ vlan = nla_data(a);
+ if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ return -EINVAL;
+ if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ return -EINVAL;
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = validate_set(a, key);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = validate_sample(a, key, depth);
+ if (err)
+ return err;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void clear_stats(struct sw_flow *flow)
+{
+ flow->used = 0;
+ flow->tcp_flags = 0;
+ flow->packet_count = 0;
+ flow->byte_count = 0;
+}
+
+static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ovs_header *ovs_header = info->userhdr;
+ struct nlattr **a = info->attrs;
+ struct sw_flow_actions *acts;
+ struct sk_buff *packet;
+ struct sw_flow *flow;
+ struct datapath *dp;
+ struct ethhdr *eth;
+ int len;
+ int err;
+ int key_len;
+
+ err = -EINVAL;
+ if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
+ !a[OVS_PACKET_ATTR_ACTIONS] ||
+ nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
+ goto err;
+
+ len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
+ packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!packet)
+ goto err;
+ skb_reserve(packet, NET_IP_ALIGN);
+
+ memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);
+
+ skb_reset_mac_header(packet);
+ eth = eth_hdr(packet);
+
+ /* Normally, setting the skb 'protocol' field would be handled by a
+ * call to eth_type_trans(), but it assumes there's a sending
+ * device, which we may not have. */
+ if (ntohs(eth->h_proto) >= 1536)
+ packet->protocol = eth->h_proto;
+ else
+ packet->protocol = htons(ETH_P_802_2);
+
+ /* Build an sw_flow for sending this packet. */
+ flow = ovs_flow_alloc();
+ err = PTR_ERR(flow);
+ if (IS_ERR(flow))
+ goto err_kfree_skb;
+
+ err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+ if (err)
+ goto err_flow_free;
+
+ err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
+ &flow->key.phy.in_port,
+ a[OVS_PACKET_ATTR_KEY]);
+ if (err)
+ goto err_flow_free;
+
+ err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
+ if (err)
+ goto err_flow_free;
+
+ flow->hash = ovs_flow_hash(&flow->key, key_len);
+
+ acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
+ err = PTR_ERR(acts);
+ if (IS_ERR(acts))
+ goto err_flow_free;
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ OVS_CB(packet)->flow = flow;
+ packet->priority = flow->key.phy.priority;
+
+ rcu_read_lock();
+ dp = get_dp(ovs_header->dp_ifindex);
+ err = -ENODEV;
+ if (!dp)
+ goto err_unlock;
+
+ local_bh_disable();
+ err = ovs_execute_actions(dp, packet);
+ local_bh_enable();
+ rcu_read_unlock();
+
+ ovs_flow_free(flow);
+ return err;
+
+err_unlock:
+ rcu_read_unlock();
+err_flow_free:
+ ovs_flow_free(flow);
+err_kfree_skb:
+ kfree_skb(packet);
+err:
+ return err;
+}
+
+static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
+ [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
+ [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
+ [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
+};
+
+static struct genl_ops dp_packet_genl_ops[] = {
+ { .cmd = OVS_PACKET_CMD_EXECUTE,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = packet_policy,
+ .doit = ovs_packet_cmd_execute
+ }
+};
+
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+{
+ int i;
+ struct flow_table *table = genl_dereference(dp->table);
+
+ stats->n_flows = ovs_flow_tbl_count(table);
+
+ stats->n_hit = stats->n_missed = stats->n_lost = 0;
+ for_each_possible_cpu(i) {
+ const struct dp_stats_percpu *percpu_stats;
+ struct dp_stats_percpu local_stats;
+ unsigned int start;
+
+ percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
+
+ do {
+ start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
+ local_stats = *percpu_stats;
+ } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
+
+ stats->n_hit += local_stats.n_hit;
+ stats->n_missed += local_stats.n_missed;
+ stats->n_lost += local_stats.n_lost;
+ }
+}
+
+static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
+ [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
+};
+
+static struct genl_family dp_flow_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_FLOW_FAMILY,
+ .version = OVS_FLOW_VERSION,
+ .maxattr = OVS_FLOW_ATTR_MAX
+};
+
+static struct genl_multicast_group ovs_dp_flow_multicast_group = {
+ .name = OVS_FLOW_MCGROUP
+};
+
+/* Called with genl_lock. */
+static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
+ struct sk_buff *skb, u32 pid,
+ u32 seq, u32 flags, u8 cmd)
+{
+ const int skb_orig_len = skb->len;
+ const struct sw_flow_actions *sf_acts;
+ struct ovs_flow_stats stats;
+ struct ovs_header *ovs_header;
+ struct nlattr *nla;
+ unsigned long used;
+ u8 tcp_flags;
+ int err;
+
+ sf_acts = rcu_dereference_protected(flow->sf_acts,
+ lockdep_genl_is_held());
+
+ ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
+ if (!ovs_header)
+ return -EMSGSIZE;
+
+ ovs_header->dp_ifindex = get_dpifindex(dp);
+
+ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
+ if (!nla)
+ goto nla_put_failure;
+ err = ovs_flow_to_nlattrs(&flow->key, skb);
+ if (err)
+ goto error;
+ nla_nest_end(skb, nla);
+
+ spin_lock_bh(&flow->lock);
+ used = flow->used;
+ stats.n_packets = flow->packet_count;
+ stats.n_bytes = flow->byte_count;
+ tcp_flags = flow->tcp_flags;
+ spin_unlock_bh(&flow->lock);
+
+ if (used)
+ NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));
+
+ if (stats.n_packets)
+ NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
+ sizeof(struct ovs_flow_stats), &stats);
+
+ if (tcp_flags)
+ NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);
+
+ /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
+ * this is the first flow to be dumped into 'skb'. This is unusual for
+ * Netlink but individual action lists can be longer than
+ * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
+ * The userspace caller can always fetch the actions separately if it
+ * really wants them. (Most userspace callers in fact don't care.)
+ *
+ * This can only fail for dump operations because the skb is always
+ * properly sized for single flows.
+ */
+ err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
+ sf_acts->actions);
+ if (err < 0 && skb_orig_len)
+ goto error;
+
+ return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+error:
+ genlmsg_cancel(skb, ovs_header);
+ return err;
+}
+
+static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
+{
+ const struct sw_flow_actions *sf_acts;
+ int len;
+
+ sf_acts = rcu_dereference_protected(flow->sf_acts,
+ lockdep_genl_is_held());
+
+ /* OVS_FLOW_ATTR_KEY */
+ len = nla_total_size(FLOW_BUFSIZE);
+ /* OVS_FLOW_ATTR_ACTIONS */
+ len += nla_total_size(sf_acts->actions_len);
+ /* OVS_FLOW_ATTR_STATS */
+ len += nla_total_size(sizeof(struct ovs_flow_stats));
+ /* OVS_FLOW_ATTR_TCP_FLAGS */
+ len += nla_total_size(1);
+ /* OVS_FLOW_ATTR_USED */
+ len += nla_total_size(8);
+
+ len += NLMSG_ALIGN(sizeof(struct ovs_header));
+
+ return genlmsg_new(len, GFP_KERNEL);
+}
+
+static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
+ struct datapath *dp,
+ u32 pid, u32 seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int retval;
+
+ skb = ovs_flow_cmd_alloc_info(flow);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
+ BUG_ON(retval < 0);
+ return skb;
+}
+
+static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sw_flow *flow;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ struct flow_table *table;
+ int error;
+ int key_len;
+
+ /* Extract key. */
+ error = -EINVAL;
+ if (!a[OVS_FLOW_ATTR_KEY])
+ goto error;
+ error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+ if (error)
+ goto error;
+
+ /* Validate actions. */
+ if (a[OVS_FLOW_ATTR_ACTIONS]) {
+ error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
+ if (error)
+ goto error;
+ } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
+ error = -EINVAL;
+ goto error;
+ }
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ error = -ENODEV;
+ if (!dp)
+ goto error;
+
+ table = genl_dereference(dp->table);
+ flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ if (!flow) {
+ struct sw_flow_actions *acts;
+
+ /* Bail out if we're not allowed to create a new flow. */
+ error = -ENOENT;
+ if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
+ goto error;
+
+ /* Expand table, if necessary, to make room. */
+ if (ovs_flow_tbl_need_to_expand(table)) {
+ struct flow_table *new_table;
+
+ new_table = ovs_flow_tbl_expand(table);
+ if (!IS_ERR(new_table)) {
+ rcu_assign_pointer(dp->table, new_table);
+ ovs_flow_tbl_deferred_destroy(table);
+ table = genl_dereference(dp->table);
+ }
+ }
+
+ /* Allocate flow. */
+ flow = ovs_flow_alloc();
+ if (IS_ERR(flow)) {
+ error = PTR_ERR(flow);
+ goto error;
+ }
+ flow->key = key;
+ clear_stats(flow);
+
+ /* Obtain actions. */
+ acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
+ error = PTR_ERR(acts);
+ if (IS_ERR(acts))
+ goto error_free_flow;
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ /* Put flow in bucket. */
+ flow->hash = ovs_flow_hash(&key, key_len);
+ ovs_flow_tbl_insert(table, flow);
+
+ reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+ info->snd_seq,
+ OVS_FLOW_CMD_NEW);
+ } else {
+ /* We found a matching flow. */
+ struct sw_flow_actions *old_acts;
+ struct nlattr *acts_attrs;
+
+ /* Bail out if we're not allowed to modify an existing flow.
+ * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
+ * because Generic Netlink treats the latter as a dump
+ * request. We also accept NLM_F_EXCL in case that bug ever
+ * gets fixed.
+ */
+ error = -EEXIST;
+ if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
+ info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
+ goto error;
+
+ /* Update actions. */
+ old_acts = rcu_dereference_protected(flow->sf_acts,
+ lockdep_genl_is_held());
+ acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
+ if (acts_attrs &&
+ (old_acts->actions_len != nla_len(acts_attrs) ||
+ memcmp(old_acts->actions, nla_data(acts_attrs),
+ old_acts->actions_len))) {
+ struct sw_flow_actions *new_acts;
+
+ new_acts = ovs_flow_actions_alloc(acts_attrs);
+ error = PTR_ERR(new_acts);
+ if (IS_ERR(new_acts))
+ goto error;
+
+ rcu_assign_pointer(flow->sf_acts, new_acts);
+ ovs_flow_deferred_free_acts(old_acts);
+ }
+
+ reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+ info->snd_seq, OVS_FLOW_CMD_NEW);
+
+ /* Clear stats. */
+ if (a[OVS_FLOW_ATTR_CLEAR]) {
+ spin_lock_bh(&flow->lock);
+ clear_stats(flow);
+ spin_unlock_bh(&flow->lock);
+ }
+ }
+
+ if (!IS_ERR(reply))
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_flow_multicast_group.id, info->nlhdr,
+ GFP_KERNEL);
+ else
+ netlink_set_err(init_net.genl_sock, 0,
+ ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
+ return 0;
+
+error_free_flow:
+ ovs_flow_free(flow);
+error:
+ return error;
+}
+
+static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sk_buff *reply;
+ struct sw_flow *flow;
+ struct datapath *dp;
+ struct flow_table *table;
+ int err;
+ int key_len;
+
+ if (!a[OVS_FLOW_ATTR_KEY])
+ return -EINVAL;
+ err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+ if (err)
+ return err;
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ if (!dp)
+ return -ENODEV;
+
+ table = genl_dereference(dp->table);
+ flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ if (!flow)
+ return -ENOENT;
+
+ reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+ info->snd_seq, OVS_FLOW_CMD_NEW);
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+
+ return genlmsg_reply(reply, info);
+}
+
+static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sk_buff *reply;
+ struct sw_flow *flow;
+ struct datapath *dp;
+ struct flow_table *table;
+ int err;
+ int key_len;
+
+ if (!a[OVS_FLOW_ATTR_KEY])
+ return flush_flows(ovs_header->dp_ifindex);
+ err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+ if (err)
+ return err;
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ if (!dp)
+ return -ENODEV;
+
+ table = genl_dereference(dp->table);
+ flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ if (!flow)
+ return -ENOENT;
+
+ reply = ovs_flow_cmd_alloc_info(flow);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_flow_tbl_remove(table, flow);
+
+ err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
+ info->snd_seq, 0, OVS_FLOW_CMD_DEL);
+ BUG_ON(err < 0);
+
+ ovs_flow_deferred_free(flow);
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
+ return 0;
+}
+
+static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct datapath *dp;
+ struct flow_table *table;
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ if (!dp)
+ return -ENODEV;
+
+ table = genl_dereference(dp->table);
+
+ for (;;) {
+ struct sw_flow *flow;
+ u32 bucket, obj;
+
+ bucket = cb->args[0];
+ obj = cb->args[1];
+ flow = ovs_flow_tbl_next(table, &bucket, &obj);
+ if (!flow)
+ break;
+
+ if (ovs_flow_cmd_fill_info(flow, dp, skb,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ OVS_FLOW_CMD_NEW) < 0)
+ break;
+
+ cb->args[0] = bucket;
+ cb->args[1] = obj;
+ }
+ return skb->len;
+}
+
+static struct genl_ops dp_flow_genl_ops[] = {
+ { .cmd = OVS_FLOW_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_new_or_set
+ },
+ { .cmd = OVS_FLOW_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_del
+ },
+ { .cmd = OVS_FLOW_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_get,
+ .dumpit = ovs_flow_cmd_dump
+ },
+ { .cmd = OVS_FLOW_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_new_or_set,
+ },
+};
+
+static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
+ [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+};
+
+static struct genl_family dp_datapath_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_DATAPATH_FAMILY,
+ .version = OVS_DATAPATH_VERSION,
+ .maxattr = OVS_DP_ATTR_MAX
+};
+
+static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
+ .name = OVS_DATAPATH_MCGROUP
+};
+
+static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
+ u32 pid, u32 seq, u32 flags, u8 cmd)
+{
+ struct ovs_header *ovs_header;
+ struct ovs_dp_stats dp_stats;
+ int err;
+
+ ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
+ flags, cmd);
+ if (!ovs_header)
+ goto error;
+
+ ovs_header->dp_ifindex = get_dpifindex(dp);
+
+ rcu_read_lock();
+ err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ get_dp_stats(dp, &dp_stats);
+ NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);
+
+ return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+ genlmsg_cancel(skb, ovs_header);
+error:
+ return -EMSGSIZE;
+}
+
+static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
+ u32 seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int retval;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
+ if (retval < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(retval);
+ }
+ return skb;
+}
+
+/* Called with genl_mutex and optionally with RTNL lock also. */
+static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
+ struct nlattr *a[OVS_DP_ATTR_MAX + 1])
+{
+ struct datapath *dp;
+
+ if (!a[OVS_DP_ATTR_NAME])
+ dp = get_dp(ovs_header->dp_ifindex);
+ else {
+ struct vport *vport;
+
+ rcu_read_lock();
+ vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
+ dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
+ rcu_read_unlock();
+ }
+ return dp ? dp : ERR_PTR(-ENODEV);
+}
+
+static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct vport_parms parms;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ struct vport *vport;
+ int err;
+
+ err = -EINVAL;
+ if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
+ goto err;
+
+ rtnl_lock();
+ err = -ENODEV;
+ if (!try_module_get(THIS_MODULE))
+ goto err_unlock_rtnl;
+
+ err = -ENOMEM;
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (dp == NULL)
+ goto err_put_module;
+ INIT_LIST_HEAD(&dp->port_list);
+
+ /* Allocate table. */
+ err = -ENOMEM;
+ rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
+ if (!dp->table)
+ goto err_free_dp;
+
+ dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
+ if (!dp->stats_percpu) {
+ err = -ENOMEM;
+ goto err_destroy_table;
+ }
+
+ /* Set up our datapath device. */
+ parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
+ parms.type = OVS_VPORT_TYPE_INTERNAL;
+ parms.options = NULL;
+ parms.dp = dp;
+ parms.port_no = OVSP_LOCAL;
+ parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+
+ vport = new_vport(&parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ if (err == -EBUSY)
+ err = -EEXIST;
+
+ goto err_destroy_percpu;
+ }
+
+ reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+ info->snd_seq, OVS_DP_CMD_NEW);
+ err = PTR_ERR(reply);
+ if (IS_ERR(reply))
+ goto err_destroy_local_port;
+
+ list_add_tail(&dp->list_node, &dps);
+ rtnl_unlock();
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_datapath_multicast_group.id, info->nlhdr,
+ GFP_KERNEL);
+ return 0;
+
+err_destroy_local_port:
+ ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+err_destroy_percpu:
+ free_percpu(dp->stats_percpu);
+err_destroy_table:
+ ovs_flow_tbl_destroy(genl_dereference(dp->table));
+err_free_dp:
+ kfree(dp);
+err_put_module:
+ module_put(THIS_MODULE);
+err_unlock_rtnl:
+ rtnl_unlock();
+err:
+ return err;
+}
+
+static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct vport *vport, *next_vport;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err;
+
+ rtnl_lock();
+ dp = lookup_datapath(info->userhdr, info->attrs);
+ err = PTR_ERR(dp);
+ if (IS_ERR(dp))
+ goto exit_unlock;
+
+ reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+ info->snd_seq, OVS_DP_CMD_DEL);
+ err = PTR_ERR(reply);
+ if (IS_ERR(reply))
+ goto exit_unlock;
+
+ list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
+ if (vport->port_no != OVSP_LOCAL)
+ ovs_dp_detach_port(vport);
+
+ list_del(&dp->list_node);
+ ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+
+ /* rtnl_unlock() will wait until all the references to devices that
+ * are pending unregistration have been dropped. We do it here to
+ * ensure that any internal devices (which contain DP pointers) are
+ * fully destroyed before freeing the datapath.
+ */
+ rtnl_unlock();
+
+ call_rcu(&dp->rcu, destroy_dp_rcu);
+ module_put(THIS_MODULE);
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_datapath_multicast_group.id, info->nlhdr,
+ GFP_KERNEL);
+
+ return 0;
+
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err;
+
+ dp = lookup_datapath(info->userhdr, info->attrs);
+ if (IS_ERR(dp))
+ return PTR_ERR(dp);
+
+ reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+ info->snd_seq, OVS_DP_CMD_NEW);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ netlink_set_err(init_net.genl_sock, 0,
+ ovs_dp_datapath_multicast_group.id, err);
+ return 0;
+ }
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_datapath_multicast_group.id, info->nlhdr,
+ GFP_KERNEL);
+
+ return 0;
+}
+
+static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *reply;
+ struct datapath *dp;
+
+ dp = lookup_datapath(info->userhdr, info->attrs);
+ if (IS_ERR(dp))
+ return PTR_ERR(dp);
+
+ reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+ info->snd_seq, OVS_DP_CMD_NEW);
+ if (IS_ERR(reply))
+ return PTR_ERR(reply);
+
+ return genlmsg_reply(reply, info);
+}
+
+static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct datapath *dp;
+ int skip = cb->args[0];
+ int i = 0;
+
+ list_for_each_entry(dp, &dps, list_node) {
+ if (i < skip)
+ continue;
+ if (ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ OVS_DP_CMD_NEW) < 0)
+ break;
+ i++;
+ }
+
+ cb->args[0] = i;
+
+ return skb->len;
+}
+
+static struct genl_ops dp_datapath_genl_ops[] = {
+ { .cmd = OVS_DP_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_new
+ },
+ { .cmd = OVS_DP_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_del
+ },
+ { .cmd = OVS_DP_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_get,
+ .dumpit = ovs_dp_cmd_dump
+ },
+ { .cmd = OVS_DP_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_set,
+ },
+};
+
+static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
+ [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
+ [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
+};
+
+static struct genl_family dp_vport_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_VPORT_FAMILY,
+ .version = OVS_VPORT_VERSION,
+ .maxattr = OVS_VPORT_ATTR_MAX
+};
+
+struct genl_multicast_group ovs_dp_vport_multicast_group = {
+ .name = OVS_VPORT_MCGROUP
+};
+
+/* Called with RTNL lock or RCU read lock. */
+static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+ u32 pid, u32 seq, u32 flags, u8 cmd)
+{
+ struct ovs_header *ovs_header;
+ struct ovs_vport_stats vport_stats;
+ int err;
+
+ ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
+ flags, cmd);
+ if (!ovs_header)
+ return -EMSGSIZE;
+
+ ovs_header->dp_ifindex = get_dpifindex(vport->dp);
+
+ NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
+ NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
+ NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
+ NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);
+
+ ovs_vport_get_stats(vport, &vport_stats);
+ NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
+ &vport_stats);
+
+ err = ovs_vport_get_options(vport, skb);
+ if (err == -EMSGSIZE)
+ goto error;
+
+ return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+error:
+ genlmsg_cancel(skb, ovs_header);
+ return err;
+}
+
+/* Called with RTNL lock or RCU read lock. */
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
+ u32 seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int retval;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
+ if (retval < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(retval);
+ }
+ return skb;
+}
+
+/* Called with RTNL lock or RCU read lock. */
+static struct vport *lookup_vport(struct ovs_header *ovs_header,
+ struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
+{
+ struct datapath *dp;
+ struct vport *vport;
+
+ if (a[OVS_VPORT_ATTR_NAME]) {
+ vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
+ if (!vport)
+ return ERR_PTR(-ENODEV);
+ return vport;
+ } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
+ u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+
+ if (port_no >= DP_MAX_PORTS)
+ return ERR_PTR(-EFBIG);
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ if (!dp)
+ return ERR_PTR(-ENODEV);
+
+ vport = rcu_dereference_rtnl(dp->ports[port_no]);
+ if (!vport)
+ return ERR_PTR(-ENOENT);
+ return vport;
+ } else
+ return ERR_PTR(-EINVAL);
+}
+
+static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct vport_parms parms;
+ struct sk_buff *reply;
+ struct vport *vport;
+ struct datapath *dp;
+ u32 port_no;
+ int err;
+
+ err = -EINVAL;
+ if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
+ !a[OVS_VPORT_ATTR_UPCALL_PID])
+ goto exit;
+
+ rtnl_lock();
+ dp = get_dp(ovs_header->dp_ifindex);
+ err = -ENODEV;
+ if (!dp)
+ goto exit_unlock;
+
+ if (a[OVS_VPORT_ATTR_PORT_NO]) {
+ port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+
+ err = -EFBIG;
+ if (port_no >= DP_MAX_PORTS)
+ goto exit_unlock;
+
+ vport = rtnl_dereference(dp->ports[port_no]);
+ err = -EBUSY;
+ if (vport)
+ goto exit_unlock;
+ } else {
+ for (port_no = 1; ; port_no++) {
+ if (port_no >= DP_MAX_PORTS) {
+ err = -EFBIG;
+ goto exit_unlock;
+ }
+ vport = rtnl_dereference(dp->ports[port_no]);
+ if (!vport)
+ break;
+ }
+ }
+
+ parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
+ parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
+ parms.options = a[OVS_VPORT_ATTR_OPTIONS];
+ parms.dp = dp;
+ parms.port_no = port_no;
+ parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
+
+ vport = new_vport(&parms);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock;
+
+ reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
+ OVS_VPORT_CMD_NEW);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ ovs_dp_detach_port(vport);
+ goto exit_unlock;
+ }
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+
+exit_unlock:
+ rtnl_unlock();
+exit:
+ return err;
+}
+
+static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ rtnl_lock();
+ vport = lookup_vport(info->userhdr, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock;
+
+ err = 0;
+ if (a[OVS_VPORT_ATTR_TYPE] &&
+ nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
+ err = -EINVAL;
+
+ if (!err && a[OVS_VPORT_ATTR_OPTIONS])
+ err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
+ if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
+ vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
+
+ reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
+ OVS_VPORT_CMD_NEW);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ netlink_set_err(init_net.genl_sock, 0,
+ ovs_dp_vport_multicast_group.id, err);
+ return 0;
+ }
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ rtnl_lock();
+ vport = lookup_vport(info->userhdr, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock;
+
+ if (vport->port_no == OVSP_LOCAL) {
+ err = -EINVAL;
+ goto exit_unlock;
+ }
+
+ reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
+ OVS_VPORT_CMD_DEL);
+ err = PTR_ERR(reply);
+ if (IS_ERR(reply))
+ goto exit_unlock;
+
+ ovs_dp_detach_port(vport);
+
+ genl_notify(reply, genl_info_net(info), info->snd_pid,
+ ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
+
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ rcu_read_lock();
+ vport = lookup_vport(ovs_header, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock;
+
+ reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
+ OVS_VPORT_CMD_NEW);
+ err = PTR_ERR(reply);
+ if (IS_ERR(reply))
+ goto exit_unlock;
+
+ rcu_read_unlock();
+
+ return genlmsg_reply(reply, info);
+
+exit_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct datapath *dp;
+ u32 port_no;
+ int retval;
+
+ dp = get_dp(ovs_header->dp_ifindex);
+ if (!dp)
+ return -ENODEV;
+
+ rcu_read_lock();
+ for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
+ struct vport *vport;
+
+ vport = rcu_dereference(dp->ports[port_no]);
+ if (!vport)
+ continue;
+
+ if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ OVS_VPORT_CMD_NEW) < 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = port_no;
+ retval = skb->len;
+
+ return retval;
+}
+
+static void rehash_flow_table(struct work_struct *work)
+{
+ struct datapath *dp;
+
+ genl_lock();
+
+ list_for_each_entry(dp, &dps, list_node) {
+ struct flow_table *old_table = genl_dereference(dp->table);
+ struct flow_table *new_table;
+
+ new_table = ovs_flow_tbl_rehash(old_table);
+ if (!IS_ERR(new_table)) {
+ rcu_assign_pointer(dp->table, new_table);
+ ovs_flow_tbl_deferred_destroy(old_table);
+ }
+ }
+
+ genl_unlock();
+
+ schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
+}
+
+static struct genl_ops dp_vport_genl_ops[] = {
+ { .cmd = OVS_VPORT_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_new
+ },
+ { .cmd = OVS_VPORT_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_del
+ },
+ { .cmd = OVS_VPORT_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_get,
+ .dumpit = ovs_vport_cmd_dump
+ },
+ { .cmd = OVS_VPORT_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_set,
+ },
+};
+
+struct genl_family_and_ops {
+ struct genl_family *family;
+ struct genl_ops *ops;
+ int n_ops;
+ struct genl_multicast_group *group;
+};
+
+static const struct genl_family_and_ops dp_genl_families[] = {
+ { &dp_datapath_genl_family,
+ dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
+ &ovs_dp_datapath_multicast_group },
+ { &dp_vport_genl_family,
+ dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
+ &ovs_dp_vport_multicast_group },
+ { &dp_flow_genl_family,
+ dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
+ &ovs_dp_flow_multicast_group },
+ { &dp_packet_genl_family,
+ dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
+ NULL },
+};
+
+static void dp_unregister_genl(int n_families)
+{
+ int i;
+
+ for (i = 0; i < n_families; i++)
+ genl_unregister_family(dp_genl_families[i].family);
+}
+
+static int dp_register_genl(void)
+{
+ int n_registered;
+ int err;
+ int i;
+
+ n_registered = 0;
+ for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
+ const struct genl_family_and_ops *f = &dp_genl_families[i];
+
+ err = genl_register_family_with_ops(f->family, f->ops,
+ f->n_ops);
+ if (err)
+ goto error;
+ n_registered++;
+
+ if (f->group) {
+ err = genl_register_mc_group(f->family, f->group);
+ if (err)
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ dp_unregister_genl(n_registered);
+ return err;
+}
+
+static int __init dp_init(void)
+{
+ struct sk_buff *dummy_skb;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
+
+ pr_info("Open vSwitch switching datapath\n");
+
+ err = ovs_flow_init();
+ if (err)
+ goto error;
+
+ err = ovs_vport_init();
+ if (err)
+ goto error_flow_exit;
+
+ err = register_netdevice_notifier(&ovs_dp_device_notifier);
+ if (err)
+ goto error_vport_exit;
+
+ err = dp_register_genl();
+ if (err < 0)
+ goto error_unreg_notifier;
+
+ schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
+
+ return 0;
+
+error_unreg_notifier:
+ unregister_netdevice_notifier(&ovs_dp_device_notifier);
+error_vport_exit:
+ ovs_vport_exit();
+error_flow_exit:
+ ovs_flow_exit();
+error:
+ return err;
+}
+
+static void dp_cleanup(void)
+{
+ cancel_delayed_work_sync(&rehash_flow_wq);
+ rcu_barrier();
+ dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
+ unregister_netdevice_notifier(&ovs_dp_device_notifier);
+ ovs_vport_exit();
+ ovs_flow_exit();
+}
+
+module_init(dp_init);
+module_exit(dp_cleanup);
+
+MODULE_DESCRIPTION("Open vSwitch switching datapath");
+MODULE_LICENSE("GPL");
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
new file mode 100644
index 000000000000..5b9f884b7055
--- /dev/null
+++ b/net/openvswitch/datapath.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef DATAPATH_H
+#define DATAPATH_H 1
+
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/version.h>
+
+#include "flow.h"
+
+struct vport;
+
+#define DP_MAX_PORTS 1024
+#define SAMPLE_ACTION_DEPTH 3
+
+/**
+ * struct dp_stats_percpu - per-cpu packet processing statistics for a given
+ * datapath.
+ * @n_hit: Number of received packets for which a matching flow was found in
+ * the flow table.
+ * @n_miss: Number of received packets that had no matching flow in the flow
+ * table. The sum of @n_hit and @n_miss is the number of packets that have
+ * been received by the datapath.
+ * @n_lost: Number of received packets that had no matching flow in the flow
+ * table that could not be sent to userspace (normally due to an overflow in
+ * one of the datapath's queues).
+ */
+struct dp_stats_percpu {
+ u64 n_hit;
+ u64 n_missed;
+ u64 n_lost;
+ struct u64_stats_sync sync;
+};
+
+/**
+ * struct datapath - datapath for flow-based packet switching
+ * @rcu: RCU callback head for deferred destruction.
+ * @list_node: Element in global 'dps' list.
+ * @n_flows: Number of flows currently in flow table.
+ * @table: Current flow table. Protected by genl_lock and RCU.
+ * @ports: Map from port number to &struct vport. %OVSP_LOCAL port
+ * always exists, other ports may be %NULL. Protected by RTNL and RCU.
+ * @port_list: List of all ports in @ports in arbitrary order. RTNL required
+ * to iterate or modify.
+ * @stats_percpu: Per-CPU datapath statistics.
+ *
+ * Context: See the comment on locking at the top of datapath.c for additional
+ * locking information.
+ */
+struct datapath {
+ struct rcu_head rcu;
+ struct list_head list_node;
+
+ /* Flow table. */
+ struct flow_table __rcu *table;
+
+ /* Switch ports. */
+ struct vport __rcu *ports[DP_MAX_PORTS];
+ struct list_head port_list;
+
+ /* Stats. */
+ struct dp_stats_percpu __percpu *stats_percpu;
+};
+
+/**
+ * struct ovs_skb_cb - OVS data in skb CB
+ * @flow: The flow associated with this packet. May be %NULL if no flow.
+ */
+struct ovs_skb_cb {
+ struct sw_flow *flow;
+};
+#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
+
+/**
+ * struct dp_upcall - metadata to include with a packet to send to userspace
+ * @cmd: One of %OVS_PACKET_CMD_*.
+ * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull.
+ * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
+ * %OVS_PACKET_ATTR_USERDATA.
+ * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no
+ * packet is sent and the packet is accounted in the datapath's @n_lost
+ * counter.
+ */
+struct dp_upcall_info {
+ u8 cmd;
+ const struct sw_flow_key *key;
+ const struct nlattr *userdata;
+ u32 pid;
+};
+
+extern struct notifier_block ovs_dp_device_notifier;
+extern struct genl_multicast_group ovs_dp_vport_multicast_group;
+
+void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
+void ovs_dp_detach_port(struct vport *);
+int ovs_dp_upcall(struct datapath *, struct sk_buff *,
+ const struct dp_upcall_info *);
+
+const char *ovs_dp_name(const struct datapath *dp);
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
+ u8 cmd);
+
+int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
+#endif /* datapath.h */
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
new file mode 100644
index 000000000000..46736518c453
--- /dev/null
+++ b/net/openvswitch/dp_notify.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/netdevice.h>
+#include <net/genetlink.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+static int dp_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct vport *vport;
+
+ if (ovs_is_internal_dev(dev))
+ vport = ovs_internal_dev_get_vport(dev);
+ else
+ vport = ovs_netdev_get_vport(dev);
+
+ if (!vport)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ if (!ovs_is_internal_dev(dev)) {
+ struct sk_buff *notify;
+
+ notify = ovs_vport_cmd_build_info(vport, 0, 0,
+ OVS_VPORT_CMD_DEL);
+ ovs_dp_detach_port(vport);
+ if (IS_ERR(notify)) {
+ netlink_set_err(init_net.genl_sock, 0,
+ ovs_dp_vport_multicast_group.id,
+ PTR_ERR(notify));
+ break;
+ }
+
+ genlmsg_multicast(notify, 0, ovs_dp_vport_multicast_group.id,
+ GFP_KERNEL);
+ }
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+struct notifier_block ovs_dp_device_notifier = {
+ .notifier_call = dp_device_event
+};
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
new file mode 100644
index 000000000000..fe7f020a843e
--- /dev/null
+++ b/net/openvswitch/flow.c
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+static struct kmem_cache *flow_cache;
+
+static int check_header(struct sk_buff *skb, int len)
+{
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+ return 0;
+}
+
+static bool arphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_network_offset(skb) +
+ sizeof(struct arp_eth_header));
+}
+
+static int check_iphdr(struct sk_buff *skb)
+{
+ unsigned int nh_ofs = skb_network_offset(skb);
+ unsigned int ip_len;
+ int err;
+
+ err = check_header(skb, nh_ofs + sizeof(struct iphdr));
+ if (unlikely(err))
+ return err;
+
+ ip_len = ip_hdrlen(skb);
+ if (unlikely(ip_len < sizeof(struct iphdr) ||
+ skb->len < nh_ofs + ip_len))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, nh_ofs + ip_len);
+ return 0;
+}
+
+static bool tcphdr_ok(struct sk_buff *skb)
+{
+ int th_ofs = skb_transport_offset(skb);
+ int tcp_len;
+
+ if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
+ return false;
+
+ tcp_len = tcp_hdrlen(skb);
+ if (unlikely(tcp_len < sizeof(struct tcphdr) ||
+ skb->len < th_ofs + tcp_len))
+ return false;
+
+ return true;
+}
+
+static bool udphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct udphdr));
+}
+
+static bool icmphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct icmphdr));
+}
+
+u64 ovs_flow_used_time(unsigned long flow_jiffies)
+{
+ struct timespec cur_ts;
+ u64 cur_ms, idle_ms;
+
+ ktime_get_ts(&cur_ts);
+ idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
+ cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ts.tv_nsec / NSEC_PER_MSEC;
+
+ return cur_ms - idle_ms;
+}
+
+#define SW_FLOW_KEY_OFFSET(field) \
+ (offsetof(struct sw_flow_key, field) + \
+ FIELD_SIZEOF(struct sw_flow_key, field))
+
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
+ int *key_lenp)
+{
+ unsigned int nh_ofs = skb_network_offset(skb);
+ unsigned int nh_len;
+ int payload_ofs;
+ struct ipv6hdr *nh;
+ uint8_t nexthdr;
+ __be16 frag_off;
+ int err;
+
+ *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
+
+ err = check_header(skb, nh_ofs + sizeof(*nh));
+ if (unlikely(err))
+ return err;
+
+ nh = ipv6_hdr(skb);
+ nexthdr = nh->nexthdr;
+ payload_ofs = (u8 *)(nh + 1) - skb->data;
+
+ key->ip.proto = NEXTHDR_NONE;
+ key->ip.tos = ipv6_get_dsfield(nh);
+ key->ip.ttl = nh->hop_limit;
+ key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
+ key->ipv6.addr.src = nh->saddr;
+ key->ipv6.addr.dst = nh->daddr;
+
+ payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
+ if (unlikely(payload_ofs < 0))
+ return -EINVAL;
+
+ if (frag_off) {
+ if (frag_off & htons(~0x7))
+ key->ip.frag = OVS_FRAG_TYPE_LATER;
+ else
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+ }
+
+ nh_len = payload_ofs - nh_ofs;
+ skb_set_transport_header(skb, nh_ofs + nh_len);
+ key->ip.proto = nexthdr;
+ return nh_len;
+}
+
+static bool icmp6hdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct icmp6hdr));
+}
+
+#define TCP_FLAGS_OFFSET 13
+#define TCP_FLAG_MASK 0x3f
+
+void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
+{
+ u8 tcp_flags = 0;
+
+ if (flow->key.eth.type == htons(ETH_P_IP) &&
+ flow->key.ip.proto == IPPROTO_TCP) {
+ u8 *tcp = (u8 *)tcp_hdr(skb);
+ tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
+ }
+
+ spin_lock(&flow->lock);
+ flow->used = jiffies;
+ flow->packet_count++;
+ flow->byte_count += skb->len;
+ flow->tcp_flags |= tcp_flags;
+ spin_unlock(&flow->lock);
+}
+
+struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
+{
+ int actions_len = nla_len(actions);
+ struct sw_flow_actions *sfa;
+
+ /* At least DP_MAX_PORTS actions are required to be able to flood a
+ * packet to every port. Factor of 2 allows for setting VLAN tags,
+ * etc. */
+ if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
+ return ERR_PTR(-EINVAL);
+
+ sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->actions_len = actions_len;
+ memcpy(sfa->actions, nla_data(actions), actions_len);
+ return sfa;
+}
+
+struct sw_flow *ovs_flow_alloc(void)
+{
+ struct sw_flow *flow;
+
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&flow->lock);
+ flow->sf_acts = NULL;
+
+ return flow;
+}
+
+static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
+{
+ hash = jhash_1word(hash, table->hash_seed);
+ return flex_array_get(table->buckets,
+ (hash & (table->n_buckets - 1)));
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head *),
+ n_buckets, GFP_KERNEL);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+struct flow_table *ovs_flow_tbl_alloc(int new_size)
+{
+ struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
+
+ if (!table)
+ return NULL;
+
+ table->buckets = alloc_buckets(new_size);
+
+ if (!table->buckets) {
+ kfree(table);
+ return NULL;
+ }
+ table->n_buckets = new_size;
+ table->count = 0;
+ table->node_ver = 0;
+ table->keep_flows = false;
+ get_random_bytes(&table->hash_seed, sizeof(u32));
+
+ return table;
+}
+
+void ovs_flow_tbl_destroy(struct flow_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ if (table->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < table->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(table->buckets, i);
+ struct hlist_node *node, *n;
+ int ver = table->node_ver;
+
+ hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
+ hlist_del_rcu(&flow->hash_node[ver]);
+ ovs_flow_free(flow);
+ }
+ }
+
+skip_flows:
+ free_buckets(table->buckets);
+ kfree(table);
+}
+
+static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct flow_table *table = container_of(rcu, struct flow_table, rcu);
+
+ ovs_flow_tbl_destroy(table);
+}
+
+void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
+{
+ if (!table)
+ return;
+
+ call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+}
+
+struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ struct hlist_node *n;
+ int ver;
+ int i;
+
+ ver = table->node_ver;
+ while (*bucket < table->n_buckets) {
+ i = 0;
+ head = flex_array_get(table->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ struct hlist_node *n;
+
+ head = flex_array_get(old->buckets, i);
+
+ hlist_for_each_entry(flow, n, head, hash_node[old_ver])
+ ovs_flow_tbl_insert(new, flow);
+ }
+ old->keep_flows = true;
+}
+
+static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
+{
+ struct flow_table *new_table;
+
+ new_table = ovs_flow_tbl_alloc(n_buckets);
+ if (!new_table)
+ return ERR_PTR(-ENOMEM);
+
+ flow_table_copy_flows(table, new_table);
+
+ return new_table;
+}
+
+struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
+{
+ return __flow_tbl_rehash(table, table->n_buckets);
+}
+
+struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
+{
+ return __flow_tbl_rehash(table, table->n_buckets * 2);
+}
+
+void ovs_flow_free(struct sw_flow *flow)
+{
+ if (unlikely(!flow))
+ return;
+
+ kfree((struct sf_flow_acts __force *)flow->sf_acts);
+ kmem_cache_free(flow_cache, flow);
+}
+
+/* RCU callback used by ovs_flow_deferred_free. */
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+
+ ovs_flow_free(flow);
+}
+
+/* Schedules 'flow' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_flow_deferred_free(struct sw_flow *flow)
+{
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+}
+
+/* RCU callback used by ovs_flow_deferred_free_acts. */
+static void rcu_free_acts_callback(struct rcu_head *rcu)
+{
+ struct sw_flow_actions *sf_acts = container_of(rcu,
+ struct sw_flow_actions, rcu);
+ kfree(sf_acts);
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
+{
+ call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ struct qtag_prefix {
+ __be16 eth_type; /* ETH_P_8021Q */
+ __be16 tci;
+ };
+ struct qtag_prefix *qp;
+
+ if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
+ return 0;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
+ sizeof(__be16))))
+ return -ENOMEM;
+
+ qp = (struct qtag_prefix *) skb->data;
+ key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
+ __skb_pull(skb, sizeof(struct qtag_prefix));
+
+ return 0;
+}
+
+static __be16 parse_ethertype(struct sk_buff *skb)
+{
+ struct llc_snap_hdr {
+ u8 dsap; /* Always 0xAA */
+ u8 ssap; /* Always 0xAA */
+ u8 ctrl;
+ u8 oui[3];
+ __be16 ethertype;
+ };
+ struct llc_snap_hdr *llc;
+ __be16 proto;
+
+ proto = *(__be16 *) skb->data;
+ __skb_pull(skb, sizeof(__be16));
+
+ if (ntohs(proto) >= 1536)
+ return proto;
+
+ if (skb->len < sizeof(struct llc_snap_hdr))
+ return htons(ETH_P_802_2);
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
+ return htons(0);
+
+ llc = (struct llc_snap_hdr *) skb->data;
+ if (llc->dsap != LLC_SAP_SNAP ||
+ llc->ssap != LLC_SAP_SNAP ||
+ (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
+ return htons(ETH_P_802_2);
+
+ __skb_pull(skb, sizeof(struct llc_snap_hdr));
+ return llc->ethertype;
+}
+
+static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
+ int *key_lenp, int nh_len)
+{
+ struct icmp6hdr *icmp = icmp6_hdr(skb);
+ int error = 0;
+ int key_len;
+
+ /* The ICMPv6 type and code fields use the 16-bit transport port
+ * fields, so we need to store them in 16-bit network byte order.
+ */
+ key->ipv6.tp.src = htons(icmp->icmp6_type);
+ key->ipv6.tp.dst = htons(icmp->icmp6_code);
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+
+ if (icmp->icmp6_code == 0 &&
+ (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ int icmp_len = skb->len - skb_transport_offset(skb);
+ struct nd_msg *nd;
+ int offset;
+
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
+
+ /* In order to process neighbor discovery options, we need the
+ * entire packet.
+ */
+ if (unlikely(icmp_len < sizeof(*nd)))
+ goto out;
+ if (unlikely(skb_linearize(skb))) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ nd = (struct nd_msg *)skb_transport_header(skb);
+ key->ipv6.nd.target = nd->target;
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
+
+ icmp_len -= sizeof(*nd);
+ offset = 0;
+ while (icmp_len >= 8) {
+ struct nd_opt_hdr *nd_opt =
+ (struct nd_opt_hdr *)(nd->opt + offset);
+ int opt_len = nd_opt->nd_opt_len * 8;
+
+ if (unlikely(!opt_len || opt_len > icmp_len))
+ goto invalid;
+
+ /* Store the link layer address if the appropriate
+ * option is provided. It is considered an error if
+ * the same link layer option is specified twice.
+ */
+ if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
+ && opt_len == 8) {
+ if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
+ goto invalid;
+ memcpy(key->ipv6.nd.sll,
+ &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+ } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
+ && opt_len == 8) {
+ if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
+ goto invalid;
+ memcpy(key->ipv6.nd.tll,
+ &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
+ }
+
+ icmp_len -= opt_len;
+ offset += opt_len;
+ }
+ }
+
+ goto out;
+
+invalid:
+ memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
+ memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
+ memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
+
+out:
+ *key_lenp = key_len;
+ return error;
+}
+
+/**
+ * ovs_flow_extract - extracts a flow key from an Ethernet frame.
+ * @skb: sk_buff that contains the frame, with skb->data pointing to the
+ * Ethernet header
+ * @in_port: port number on which @skb was received.
+ * @key: output flow key
+ * @key_lenp: length of output flow key
+ *
+ * The caller must ensure that skb->len >= ETH_HLEN.
+ *
+ * Returns 0 if successful, otherwise a negative errno value.
+ *
+ * Initializes @skb header pointers as follows:
+ *
+ * - skb->mac_header: the Ethernet header.
+ *
+ * - skb->network_header: just past the Ethernet header, or just past the
+ * VLAN header, to the first byte of the Ethernet payload.
+ *
+ * - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6
+ * on output, then just past the IP header, if one is present and
+ * of a correct length, otherwise the same as skb->network_header.
+ * For other key->dl_type values it is left untouched.
+ */
+int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
+ int *key_lenp)
+{
+ int error = 0;
+ int key_len = SW_FLOW_KEY_OFFSET(eth);
+ struct ethhdr *eth;
+
+ memset(key, 0, sizeof(*key));
+
+ key->phy.priority = skb->priority;
+ key->phy.in_port = in_port;
+
+ skb_reset_mac_header(skb);
+
+ /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
+ * header in the linear data area.
+ */
+ eth = eth_hdr(skb);
+ memcpy(key->eth.src, eth->h_source, ETH_ALEN);
+ memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
+
+ __skb_pull(skb, 2 * ETH_ALEN);
+
+ if (vlan_tx_tag_present(skb))
+ key->eth.tci = htons(skb->vlan_tci);
+ else if (eth->h_proto == htons(ETH_P_8021Q))
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
+
+ key->eth.type = parse_ethertype(skb);
+ if (unlikely(key->eth.type == htons(0)))
+ return -ENOMEM;
+
+ skb_reset_network_header(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ /* Network layer. */
+ if (key->eth.type == htons(ETH_P_IP)) {
+ struct iphdr *nh;
+ __be16 offset;
+
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
+
+ error = check_iphdr(skb);
+ if (unlikely(error)) {
+ if (error == -EINVAL) {
+ skb->transport_header = skb->network_header;
+ error = 0;
+ }
+ goto out;
+ }
+
+ nh = ip_hdr(skb);
+ key->ipv4.addr.src = nh->saddr;
+ key->ipv4.addr.dst = nh->daddr;
+
+ key->ip.proto = nh->protocol;
+ key->ip.tos = nh->tos;
+ key->ip.ttl = nh->ttl;
+
+ offset = nh->frag_off & htons(IP_OFFSET);
+ if (offset) {
+ key->ip.frag = OVS_FRAG_TYPE_LATER;
+ goto out;
+ }
+ if (nh->frag_off & htons(IP_MF) ||
+ skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+
+ /* Transport layer. */
+ if (key->ip.proto == IPPROTO_TCP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ if (tcphdr_ok(skb)) {
+ struct tcphdr *tcp = tcp_hdr(skb);
+ key->ipv4.tp.src = tcp->source;
+ key->ipv4.tp.dst = tcp->dest;
+ }
+ } else if (key->ip.proto == IPPROTO_UDP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ if (udphdr_ok(skb)) {
+ struct udphdr *udp = udp_hdr(skb);
+ key->ipv4.tp.src = udp->source;
+ key->ipv4.tp.dst = udp->dest;
+ }
+ } else if (key->ip.proto == IPPROTO_ICMP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ if (icmphdr_ok(skb)) {
+ struct icmphdr *icmp = icmp_hdr(skb);
+ /* The ICMP type and code fields use the 16-bit
+ * transport port fields, so we need to store
+ * them in 16-bit network byte order. */
+ key->ipv4.tp.src = htons(icmp->type);
+ key->ipv4.tp.dst = htons(icmp->code);
+ }
+ }
+
+ } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
+ struct arp_eth_header *arp;
+
+ arp = (struct arp_eth_header *)skb_network_header(skb);
+
+ if (arp->ar_hrd == htons(ARPHRD_ETHER)
+ && arp->ar_pro == htons(ETH_P_IP)
+ && arp->ar_hln == ETH_ALEN
+ && arp->ar_pln == 4) {
+
+ /* We only match on the lower 8 bits of the opcode. */
+ if (ntohs(arp->ar_op) <= 0xff)
+ key->ip.proto = ntohs(arp->ar_op);
+
+ if (key->ip.proto == ARPOP_REQUEST
+ || key->ip.proto == ARPOP_REPLY) {
+ memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
+ memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
+ memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
+ memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
+ }
+ }
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
+ int nh_len; /* IPv6 Header + Extensions */
+
+ nh_len = parse_ipv6hdr(skb, key, &key_len);
+ if (unlikely(nh_len < 0)) {
+ if (nh_len == -EINVAL)
+ skb->transport_header = skb->network_header;
+ else
+ error = nh_len;
+ goto out;
+ }
+
+ if (key->ip.frag == OVS_FRAG_TYPE_LATER)
+ goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+
+ /* Transport layer. */
+ if (key->ip.proto == NEXTHDR_TCP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ if (tcphdr_ok(skb)) {
+ struct tcphdr *tcp = tcp_hdr(skb);
+ key->ipv6.tp.src = tcp->source;
+ key->ipv6.tp.dst = tcp->dest;
+ }
+ } else if (key->ip.proto == NEXTHDR_UDP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ if (udphdr_ok(skb)) {
+ struct udphdr *udp = udp_hdr(skb);
+ key->ipv6.tp.src = udp->source;
+ key->ipv6.tp.dst = udp->dest;
+ }
+ } else if (key->ip.proto == NEXTHDR_ICMP) {
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ if (icmp6hdr_ok(skb)) {
+ error = parse_icmpv6(skb, key, &key_len, nh_len);
+ if (error < 0)
+ goto out;
+ }
+ }
+ }
+
+out:
+ *key_lenp = key_len;
+ return error;
+}
+
+u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
+{
+ return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
+ struct sw_flow_key *key, int key_len)
+{
+ struct sw_flow *flow;
+ struct hlist_node *n;
+ struct hlist_head *head;
+ u32 hash;
+
+ hash = ovs_flow_hash(key, key_len);
+
+ head = find_bucket(table, hash);
+ hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
+
+ if (flow->hash == hash &&
+ !memcmp(&flow->key, key, key_len)) {
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(table, flow->hash);
+ hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+ table->count++;
+}
+
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+{
+ hlist_del_rcu(&flow->hash_node[table->node_ver]);
+ table->count--;
+ BUG_ON(table->count < 0);
+}
+
+/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
+const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = -1,
+ [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
+ [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
+ [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
+ [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
+ [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
+ [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
+ [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
+ [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
+ [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+ [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
+ [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
+ [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
+ [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
+};
+
+static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
+ const struct nlattr *a[], u32 *attrs)
+{
+ const struct ovs_key_icmp *icmp_key;
+ const struct ovs_key_tcp *tcp_key;
+ const struct ovs_key_udp *udp_key;
+
+ switch (swkey->ip.proto) {
+ case IPPROTO_TCP:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ swkey->ipv4.tp.src = tcp_key->tcp_src;
+ swkey->ipv4.tp.dst = tcp_key->tcp_dst;
+ break;
+
+ case IPPROTO_UDP:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ swkey->ipv4.tp.src = udp_key->udp_src;
+ swkey->ipv4.tp.dst = udp_key->udp_dst;
+ break;
+
+ case IPPROTO_ICMP:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
+ icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+ swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
+ swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
+ break;
+ }
+
+ return 0;
+}
+
+static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
+ const struct nlattr *a[], u32 *attrs)
+{
+ const struct ovs_key_icmpv6 *icmpv6_key;
+ const struct ovs_key_tcp *tcp_key;
+ const struct ovs_key_udp *udp_key;
+
+ switch (swkey->ip.proto) {
+ case IPPROTO_TCP:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ swkey->ipv6.tp.src = tcp_key->tcp_src;
+ swkey->ipv6.tp.dst = tcp_key->tcp_dst;
+ break;
+
+ case IPPROTO_UDP:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ swkey->ipv6.tp.src = udp_key->udp_src;
+ swkey->ipv6.tp.dst = udp_key->udp_dst;
+ break;
+
+ case IPPROTO_ICMPV6:
+ if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
+ icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+ swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
+ swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
+
+ if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ const struct ovs_key_nd *nd_key;
+
+ if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_ND);
+
+ *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
+ nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+ memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
+ sizeof(swkey->ipv6.nd.target));
+ memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
+ memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
+ }
+ break;
+ }
+
+ return 0;
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u32 *attrsp)
+{
+ const struct nlattr *nla;
+ u32 attrs;
+ int rem;
+
+ attrs = 0;
+ nla_for_each_nested(nla, attr, rem) {
+ u16 type = nla_type(nla);
+ int expected_len;
+
+ if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
+ return -EINVAL;
+
+ expected_len = ovs_key_lens[type];
+ if (nla_len(nla) != expected_len && expected_len != -1)
+ return -EINVAL;
+
+ attrs |= 1 << type;
+ a[type] = nla;
+ }
+ if (rem)
+ return -EINVAL;
+
+ *attrsp = attrs;
+ return 0;
+}
+
+/**
+ * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
+ * @swkey: receives the extracted flow key.
+ * @key_lenp: number of bytes used in @swkey.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ */
+int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
+ const struct nlattr *attr)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ const struct ovs_key_ethernet *eth_key;
+ int key_len;
+ u32 attrs;
+ int err;
+
+ memset(swkey, 0, sizeof(struct sw_flow_key));
+ key_len = SW_FLOW_KEY_OFFSET(eth);
+
+ err = parse_flow_nlattrs(attr, a, &attrs);
+ if (err)
+ return err;
+
+ /* Metadata attributes. */
+ if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+ swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
+ attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+ }
+ if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+ u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
+ if (in_port >= DP_MAX_PORTS)
+ return -EINVAL;
+ swkey->phy.in_port = in_port;
+ attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+ } else {
+ swkey->phy.in_port = USHRT_MAX;
+ }
+
+ /* Data attributes. */
+ if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
+ return -EINVAL;
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+
+ eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+ memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
+ memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
+
+ if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
+ nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
+ const struct nlattr *encap;
+ __be16 tci;
+
+ if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
+ (1 << OVS_KEY_ATTR_ETHERTYPE) |
+ (1 << OVS_KEY_ATTR_ENCAP)))
+ return -EINVAL;
+
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ if (tci & htons(VLAN_TAG_PRESENT)) {
+ swkey->eth.tci = tci;
+
+ err = parse_flow_nlattrs(encap, a, &attrs);
+ if (err)
+ return err;
+ } else if (!tci) {
+ /* Corner case for truncated 802.1Q header. */
+ if (nla_len(encap))
+ return -EINVAL;
+
+ swkey->eth.type = htons(ETH_P_8021Q);
+ *key_lenp = key_len;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (ntohs(swkey->eth.type) < 1536)
+ return -EINVAL;
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ } else {
+ swkey->eth.type = htons(ETH_P_802_2);
+ }
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
+ return -EINVAL;
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
+ ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
+ if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
+ return -EINVAL;
+ swkey->ip.proto = ipv4_key->ipv4_proto;
+ swkey->ip.tos = ipv4_key->ipv4_tos;
+ swkey->ip.ttl = ipv4_key->ipv4_ttl;
+ swkey->ip.frag = ipv4_key->ipv4_frag;
+ swkey->ipv4.addr.src = ipv4_key->ipv4_src;
+ swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
+
+ if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+ err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
+ if (err)
+ return err;
+ }
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ const struct ovs_key_ipv6 *ipv6_key;
+
+ if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
+ return -EINVAL;
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+
+ key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
+ ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+ if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
+ return -EINVAL;
+ swkey->ipv6.label = ipv6_key->ipv6_label;
+ swkey->ip.proto = ipv6_key->ipv6_proto;
+ swkey->ip.tos = ipv6_key->ipv6_tclass;
+ swkey->ip.ttl = ipv6_key->ipv6_hlimit;
+ swkey->ip.frag = ipv6_key->ipv6_frag;
+ memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
+ sizeof(swkey->ipv6.addr.src));
+ memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
+ sizeof(swkey->ipv6.addr.dst));
+
+ if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+ err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
+ if (err)
+ return err;
+ }
+ } else if (swkey->eth.type == htons(ETH_P_ARP)) {
+ const struct ovs_key_arp *arp_key;
+
+ if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
+ return -EINVAL;
+ attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+
+ key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
+ arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+ swkey->ipv4.addr.src = arp_key->arp_sip;
+ swkey->ipv4.addr.dst = arp_key->arp_tip;
+ if (arp_key->arp_op & htons(0xff00))
+ return -EINVAL;
+ swkey->ip.proto = ntohs(arp_key->arp_op);
+ memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
+ memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
+ }
+
+ if (attrs)
+ return -EINVAL;
+ *key_lenp = key_len;
+
+ return 0;
+}
+
+/**
+ * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
+ * @in_port: receives the extracted input port.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ *
+ * This parses a series of Netlink attributes that form a flow key, which must
+ * take the same form accepted by flow_from_nlattrs(), but only enough of it to
+ * get the metadata, that is, the parts of the flow key that cannot be
+ * extracted from the packet itself.
+ */
+int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
+ const struct nlattr *attr)
+{
+ const struct nlattr *nla;
+ int rem;
+
+ *in_port = USHRT_MAX;
+ *priority = 0;
+
+ nla_for_each_nested(nla, attr, rem) {
+ int type = nla_type(nla);
+
+ if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
+ if (nla_len(nla) != ovs_key_lens[type])
+ return -EINVAL;
+
+ switch (type) {
+ case OVS_KEY_ATTR_PRIORITY:
+ *priority = nla_get_u32(nla);
+ break;
+
+ case OVS_KEY_ATTR_IN_PORT:
+ if (nla_get_u32(nla) >= DP_MAX_PORTS)
+ return -EINVAL;
+ *in_port = nla_get_u32(nla);
+ break;
+ }
+ }
+ }
+ if (rem)
+ return -EINVAL;
+ return 0;
+}
+
+int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
+{
+ struct ovs_key_ethernet *eth_key;
+ struct nlattr *nla, *encap;
+
+ if (swkey->phy.priority)
+ NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority);
+
+ if (swkey->phy.in_port != USHRT_MAX)
+ NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port);
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
+ eth_key = nla_data(nla);
+ memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
+ memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
+
+ if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
+ NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q));
+ NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci);
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.tci)
+ goto unencap;
+ } else {
+ encap = NULL;
+ }
+
+ if (swkey->eth.type == htons(ETH_P_802_2))
+ goto unencap;
+
+ NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type);
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ipv4 *ipv4_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv4_key = nla_data(nla);
+ ipv4_key->ipv4_src = swkey->ipv4.addr.src;
+ ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
+ ipv4_key->ipv4_proto = swkey->ip.proto;
+ ipv4_key->ipv4_tos = swkey->ip.tos;
+ ipv4_key->ipv4_ttl = swkey->ip.ttl;
+ ipv4_key->ipv4_frag = swkey->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ipv6 *ipv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_key = nla_data(nla);
+ memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
+ sizeof(ipv6_key->ipv6_src));
+ memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
+ sizeof(ipv6_key->ipv6_dst));
+ ipv6_key->ipv6_label = swkey->ipv6.label;
+ ipv6_key->ipv6_proto = swkey->ip.proto;
+ ipv6_key->ipv6_tclass = swkey->ip.tos;
+ ipv6_key->ipv6_hlimit = swkey->ip.ttl;
+ ipv6_key->ipv6_frag = swkey->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_ARP)) {
+ struct ovs_key_arp *arp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
+ if (!nla)
+ goto nla_put_failure;
+ arp_key = nla_data(nla);
+ memset(arp_key, 0, sizeof(struct ovs_key_arp));
+ arp_key->arp_sip = swkey->ipv4.addr.src;
+ arp_key->arp_tip = swkey->ipv4.addr.dst;
+ arp_key->arp_op = htons(swkey->ip.proto);
+ memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
+ memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
+ }
+
+ if ((swkey->eth.type == htons(ETH_P_IP) ||
+ swkey->eth.type == htons(ETH_P_IPV6)) &&
+ swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+
+ if (swkey->ip.proto == IPPROTO_TCP) {
+ struct ovs_key_tcp *tcp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
+ if (!nla)
+ goto nla_put_failure;
+ tcp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ tcp_key->tcp_src = swkey->ipv4.tp.src;
+ tcp_key->tcp_dst = swkey->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ tcp_key->tcp_src = swkey->ipv6.tp.src;
+ tcp_key->tcp_dst = swkey->ipv6.tp.dst;
+ }
+ } else if (swkey->ip.proto == IPPROTO_UDP) {
+ struct ovs_key_udp *udp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
+ if (!nla)
+ goto nla_put_failure;
+ udp_key = nla_data(nla);
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ udp_key->udp_src = swkey->ipv4.tp.src;
+ udp_key->udp_dst = swkey->ipv4.tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ udp_key->udp_src = swkey->ipv6.tp.src;
+ udp_key->udp_dst = swkey->ipv6.tp.dst;
+ }
+ } else if (swkey->eth.type == htons(ETH_P_IP) &&
+ swkey->ip.proto == IPPROTO_ICMP) {
+ struct ovs_key_icmp *icmp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmp_key = nla_data(nla);
+ icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
+ icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
+ } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
+ swkey->ip.proto == IPPROTO_ICMPV6) {
+ struct ovs_key_icmpv6 *icmpv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
+ sizeof(*icmpv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmpv6_key = nla_data(nla);
+ icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
+ icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
+
+ if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ struct ovs_key_nd *nd_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
+ if (!nla)
+ goto nla_put_failure;
+ nd_key = nla_data(nla);
+ memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
+ sizeof(nd_key->nd_target));
+ memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
+ memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
+ }
+ }
+ }
+
+unencap:
+ if (encap)
+ nla_nest_end(skb, encap);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int ovs_flow_init(void)
+{
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
+ 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void ovs_flow_exit(void)
+{
+ kmem_cache_destroy(flow_cache);
+}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
new file mode 100644
index 000000000000..2747dc2c4ac1
--- /dev/null
+++ b/net/openvswitch/flow.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_H
+#define FLOW_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+#include <net/inet_ecn.h>
+
+struct sk_buff;
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ u32 actions_len;
+ struct nlattr actions[];
+};
+
+struct sw_flow_key {
+ struct {
+ u32 priority; /* Packet QoS priority. */
+ u16 in_port; /* Input switch port (or USHRT_MAX). */
+ } phy;
+ struct {
+ u8 src[ETH_ALEN]; /* Ethernet source address. */
+ u8 dst[ETH_ALEN]; /* Ethernet destination address. */
+ __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ __be16 type; /* Ethernet frame type. */
+ } eth;
+ struct {
+ u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
+ u8 tos; /* IP ToS. */
+ u8 ttl; /* IP TTL/hop limit. */
+ u8 frag; /* One of OVS_FRAG_TYPE_*. */
+ } ip;
+ union {
+ struct {
+ struct {
+ __be32 src; /* IP source address. */
+ __be32 dst; /* IP destination address. */
+ } addr;
+ union {
+ struct {
+ __be16 src; /* TCP/UDP source port. */
+ __be16 dst; /* TCP/UDP destination port. */
+ } tp;
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
+ };
+ } ipv4;
+ struct {
+ struct {
+ struct in6_addr src; /* IPv6 source address. */
+ struct in6_addr dst; /* IPv6 destination address. */
+ } addr;
+ __be32 label; /* IPv6 flow label. */
+ struct {
+ __be16 src; /* TCP/UDP source port. */
+ __be16 dst; /* TCP/UDP destination port. */
+ } tp;
+ struct {
+ struct in6_addr target; /* ND target address. */
+ u8 sll[ETH_ALEN]; /* ND source link layer address. */
+ u8 tll[ETH_ALEN]; /* ND target link layer address. */
+ } nd;
+ } ipv6;
+ };
+};
+
+struct sw_flow {
+ struct rcu_head rcu;
+ struct hlist_node hash_node[2];
+ u32 hash;
+
+ struct sw_flow_key key;
+ struct sw_flow_actions __rcu *sf_acts;
+
+ spinlock_t lock; /* Lock for values below. */
+ unsigned long used; /* Last used time (in jiffies). */
+ u64 packet_count; /* Number of packets matched. */
+ u64 byte_count; /* Number of bytes matched. */
+ u8 tcp_flags; /* Union of seen TCP flags. */
+};
+
+struct arp_eth_header {
+ __be16 ar_hrd; /* format of hardware address */
+ __be16 ar_pro; /* format of protocol address */
+ unsigned char ar_hln; /* length of hardware address */
+ unsigned char ar_pln; /* length of protocol address */
+ __be16 ar_op; /* ARP opcode (command) */
+
+ /* Ethernet+IPv4 specific members. */
+ unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
+ unsigned char ar_sip[4]; /* sender IP address */
+ unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
+ unsigned char ar_tip[4]; /* target IP address */
+} __packed;
+
+int ovs_flow_init(void);
+void ovs_flow_exit(void);
+
+struct sw_flow *ovs_flow_alloc(void);
+void ovs_flow_deferred_free(struct sw_flow *);
+void ovs_flow_free(struct sw_flow *flow);
+
+struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
+void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
+
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
+ int *key_lenp);
+void ovs_flow_used(struct sw_flow *, struct sk_buff *);
+u64 ovs_flow_used_time(unsigned long flow_jiffies);
+
+/* Upper bound on the length of a nlattr-formatted flow key. The longest
+ * nlattr-formatted flow key would be:
+ *
+ * struct pad nl hdr total
+ * ------ --- ------ -----
+ * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
+ * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
+ * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
+ * OVS_KEY_ATTR_8021Q 4 -- 4 8
+ * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8
+ * OVS_KEY_ATTR_IPV6 40 -- 4 44
+ * OVS_KEY_ATTR_ICMPV6 2 2 4 8
+ * OVS_KEY_ATTR_ND 28 -- 4 32
+ * -------------------------------------------------
+ * total 132
+ */
+#define FLOW_BUFSIZE 132
+
+int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
+int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
+ const struct nlattr *);
+int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
+ const struct nlattr *);
+
+#define TBL_MIN_BUCKETS 1024
+
+struct flow_table {
+ struct flex_array *buckets;
+ unsigned int count, n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
+};
+
+static inline int ovs_flow_tbl_count(struct flow_table *table)
+{
+ return table->count;
+}
+
+static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
+{
+ return (table->count > table->n_buckets);
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
+ struct sw_flow_key *key, int len);
+void ovs_flow_tbl_destroy(struct flow_table *table);
+void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
+struct flow_table *ovs_flow_tbl_alloc(int new_size);
+struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
+struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
+void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow);
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
+
+struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
+extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
+
+#endif /* flow.h */
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
new file mode 100644
index 000000000000..8fc28b86f2b3
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/hardirq.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+struct internal_dev {
+ struct vport *vport;
+};
+
+static struct internal_dev *internal_dev_priv(struct net_device *netdev)
+{
+ return netdev_priv(netdev);
+}
+
+/* This function is only called by the kernel network layer.*/
+static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct vport *vport = ovs_internal_dev_get_vport(netdev);
+ struct ovs_vport_stats vport_stats;
+
+ ovs_vport_get_stats(vport, &vport_stats);
+
+ /* The tx and rx stats need to be swapped because the
+ * switch and host OS have opposite perspectives. */
+ stats->rx_packets = vport_stats.tx_packets;
+ stats->tx_packets = vport_stats.rx_packets;
+ stats->rx_bytes = vport_stats.tx_bytes;
+ stats->tx_bytes = vport_stats.rx_bytes;
+ stats->rx_errors = vport_stats.tx_errors;
+ stats->tx_errors = vport_stats.rx_errors;
+ stats->rx_dropped = vport_stats.tx_dropped;
+ stats->tx_dropped = vport_stats.rx_dropped;
+
+ return stats;
+}
+
+static int internal_dev_mac_addr(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ return 0;
+}
+
+/* Called with rcu_read_lock_bh. */
+static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ rcu_read_lock();
+ ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int internal_dev_open(struct net_device *netdev)
+{
+ netif_start_queue(netdev);
+ return 0;
+}
+
+static int internal_dev_stop(struct net_device *netdev)
+{
+ netif_stop_queue(netdev);
+ return 0;
+}
+
+static void internal_dev_getinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, "openvswitch");
+}
+
+static const struct ethtool_ops internal_dev_ethtool_ops = {
+ .get_drvinfo = internal_dev_getinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ if (new_mtu < 68)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+ return 0;
+}
+
+static void internal_dev_destructor(struct net_device *dev)
+{
+ struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+ ovs_vport_free(vport);
+ free_netdev(dev);
+}
+
+static const struct net_device_ops internal_dev_netdev_ops = {
+ .ndo_open = internal_dev_open,
+ .ndo_stop = internal_dev_stop,
+ .ndo_start_xmit = internal_dev_xmit,
+ .ndo_set_mac_address = internal_dev_mac_addr,
+ .ndo_change_mtu = internal_dev_change_mtu,
+ .ndo_get_stats64 = internal_dev_get_stats,
+};
+
+static void do_setup(struct net_device *netdev)
+{
+ ether_setup(netdev);
+
+ netdev->netdev_ops = &internal_dev_netdev_ops;
+
+ netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netdev->destructor = internal_dev_destructor;
+ SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops);
+ netdev->tx_queue_len = 0;
+
+ netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
+ NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
+
+ netdev->vlan_features = netdev->features;
+ netdev->features |= NETIF_F_HW_VLAN_TX;
+ netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
+ random_ether_addr(netdev->dev_addr);
+}
+
+static struct vport *internal_dev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+ struct netdev_vport *netdev_vport;
+ struct internal_dev *internal_dev;
+ int err;
+
+ vport = ovs_vport_alloc(sizeof(struct netdev_vport),
+ &ovs_internal_vport_ops, parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ goto error;
+ }
+
+ netdev_vport = netdev_vport_priv(vport);
+
+ netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, do_setup);
+ if (!netdev_vport->dev) {
+ err = -ENOMEM;
+ goto error_free_vport;
+ }
+
+ internal_dev = internal_dev_priv(netdev_vport->dev);
+ internal_dev->vport = vport;
+
+ err = register_netdevice(netdev_vport->dev);
+ if (err)
+ goto error_free_netdev;
+
+ dev_set_promiscuity(netdev_vport->dev, 1);
+ netif_start_queue(netdev_vport->dev);
+
+ return vport;
+
+error_free_netdev:
+ free_netdev(netdev_vport->dev);
+error_free_vport:
+ ovs_vport_free(vport);
+error:
+ return ERR_PTR(err);
+}
+
+static void internal_dev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ netif_stop_queue(netdev_vport->dev);
+ dev_set_promiscuity(netdev_vport->dev, -1);
+
+ /* unregister_netdevice() waits for an RCU grace period. */
+ unregister_netdevice(netdev_vport->dev);
+}
+
+static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
+{
+ struct net_device *netdev = netdev_vport_priv(vport)->dev;
+ int len;
+
+ len = skb->len;
+ skb->dev = netdev;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, netdev);
+
+ netif_rx(skb);
+
+ return len;
+}
+
+const struct vport_ops ovs_internal_vport_ops = {
+ .type = OVS_VPORT_TYPE_INTERNAL,
+ .create = internal_dev_create,
+ .destroy = internal_dev_destroy,
+ .get_name = ovs_netdev_get_name,
+ .get_ifindex = ovs_netdev_get_ifindex,
+ .send = internal_dev_recv,
+};
+
+int ovs_is_internal_dev(const struct net_device *netdev)
+{
+ return netdev->netdev_ops == &internal_dev_netdev_ops;
+}
+
+struct vport *ovs_internal_dev_get_vport(struct net_device *netdev)
+{
+ if (!ovs_is_internal_dev(netdev))
+ return NULL;
+
+ return internal_dev_priv(netdev)->vport;
+}
diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h
new file mode 100644
index 000000000000..3454447c5f11
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_INTERNAL_DEV_H
+#define VPORT_INTERNAL_DEV_H 1
+
+#include "datapath.h"
+#include "vport.h"
+
+int ovs_is_internal_dev(const struct net_device *);
+struct vport *ovs_internal_dev_get_vport(struct net_device *);
+
+#endif /* vport-internal_dev.h */
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
new file mode 100644
index 000000000000..c1068aed03d1
--- /dev/null
+++ b/net/openvswitch/vport-netdev.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if_arp.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/llc.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+
+#include <net/llc.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+/* Must be called with rcu_read_lock. */
+static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
+{
+ if (unlikely(!vport)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Make our own copy of the packet. Otherwise we will mangle the
+ * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
+ * (No one comes after us, since we tell handle_bridge() that we took
+ * the packet.) */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return;
+
+ skb_push(skb, ETH_HLEN);
+ ovs_vport_receive(vport, skb);
+}
+
+/* Called with rcu_read_lock and bottom-halves disabled. */
+static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct vport *vport;
+
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
+
+ vport = ovs_netdev_get_vport(skb->dev);
+
+ netdev_port_receive(vport, skb);
+
+ return RX_HANDLER_CONSUMED;
+}
+
+static struct vport *netdev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+ struct netdev_vport *netdev_vport;
+ int err;
+
+ vport = ovs_vport_alloc(sizeof(struct netdev_vport),
+ &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ goto error;
+ }
+
+ netdev_vport = netdev_vport_priv(vport);
+
+ netdev_vport->dev = dev_get_by_name(&init_net, parms->name);
+ if (!netdev_vport->dev) {
+ err = -ENODEV;
+ goto error_free_vport;
+ }
+
+ if (netdev_vport->dev->flags & IFF_LOOPBACK ||
+ netdev_vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(netdev_vport->dev)) {
+ err = -EINVAL;
+ goto error_put;
+ }
+
+ err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ vport);
+ if (err)
+ goto error_put;
+
+ dev_set_promiscuity(netdev_vport->dev, 1);
+ netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+
+ return vport;
+
+error_put:
+ dev_put(netdev_vport->dev);
+error_free_vport:
+ ovs_vport_free(vport);
+error:
+ return ERR_PTR(err);
+}
+
+static void netdev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(netdev_vport->dev);
+ dev_set_promiscuity(netdev_vport->dev, -1);
+
+ synchronize_rcu();
+
+ dev_put(netdev_vport->dev);
+ ovs_vport_free(vport);
+}
+
+const char *ovs_netdev_get_name(const struct vport *vport)
+{
+ const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ return netdev_vport->dev->name;
+}
+
+int ovs_netdev_get_ifindex(const struct vport *vport)
+{
+ const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ return netdev_vport->dev->ifindex;
+}
+
+static unsigned packet_length(const struct sk_buff *skb)
+{
+ unsigned length = skb->len - ETH_HLEN;
+
+ if (skb->protocol == htons(ETH_P_8021Q))
+ length -= VLAN_HLEN;
+
+ return length;
+}
+
+static int netdev_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ int mtu = netdev_vport->dev->mtu;
+ int len;
+
+ if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ if (net_ratelimit())
+ pr_warn("%s: dropped over-mtu packet: %d > %d\n",
+ ovs_dp_name(vport->dp), packet_length(skb), mtu);
+ goto error;
+ }
+
+ if (unlikely(skb_warn_if_lro(skb)))
+ goto error;
+
+ skb->dev = netdev_vport->dev;
+ len = skb->len;
+ dev_queue_xmit(skb);
+
+ return len;
+
+error:
+ kfree_skb(skb);
+ ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
+ return 0;
+}
+
+/* Returns null if this device is not attached to a datapath. */
+struct vport *ovs_netdev_get_vport(struct net_device *dev)
+{
+ if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
+ return (struct vport *)
+ rcu_dereference_rtnl(dev->rx_handler_data);
+ else
+ return NULL;
+}
+
+const struct vport_ops ovs_netdev_vport_ops = {
+ .type = OVS_VPORT_TYPE_NETDEV,
+ .create = netdev_create,
+ .destroy = netdev_destroy,
+ .get_name = ovs_netdev_get_name,
+ .get_ifindex = ovs_netdev_get_ifindex,
+ .send = netdev_send,
+};
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
new file mode 100644
index 000000000000..fd9b008a0e6e
--- /dev/null
+++ b/net/openvswitch/vport-netdev.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_NETDEV_H
+#define VPORT_NETDEV_H 1
+
+#include <linux/netdevice.h>
+
+#include "vport.h"
+
+struct vport *ovs_netdev_get_vport(struct net_device *dev);
+
+struct netdev_vport {
+ struct net_device *dev;
+};
+
+static inline struct netdev_vport *
+netdev_vport_priv(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+const char *ovs_netdev_get_name(const struct vport *);
+const char *ovs_netdev_get_config(const struct vport *);
+int ovs_netdev_get_ifindex(const struct vport *);
+
+#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
new file mode 100644
index 000000000000..6cd760131f15
--- /dev/null
+++ b/net/openvswitch/vport.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/dcache.h>
+#include <linux/etherdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/compat.h>
+#include <linux/version.h>
+
+#include "vport.h"
+#include "vport-internal_dev.h"
+
+/* List of statically compiled vport implementations. Don't forget to also
+ * add yours to the list at the bottom of vport.h. */
+static const struct vport_ops *vport_ops_list[] = {
+ &ovs_netdev_vport_ops,
+ &ovs_internal_vport_ops,
+};
+
+/* Protected by RCU read lock for reading, RTNL lock for writing. */
+static struct hlist_head *dev_table;
+#define VPORT_HASH_BUCKETS 1024
+
+/**
+ * ovs_vport_init - initialize vport subsystem
+ *
+ * Called at module load time to initialize the vport subsystem.
+ */
+int ovs_vport_init(void)
+{
+ dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!dev_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * ovs_vport_exit - shutdown vport subsystem
+ *
+ * Called at module exit time to shutdown the vport subsystem.
+ */
+void ovs_vport_exit(void)
+{
+ kfree(dev_table);
+}
+
+static struct hlist_head *hash_bucket(const char *name)
+{
+ unsigned int hash = full_name_hash(name, strlen(name));
+ return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
+}
+
+/**
+ * ovs_vport_locate - find a port that has already been created
+ *
+ * @name: name of port to find
+ *
+ * Must be called with RTNL or RCU read lock.
+ */
+struct vport *ovs_vport_locate(const char *name)
+{
+ struct hlist_head *bucket = hash_bucket(name);
+ struct vport *vport;
+ struct hlist_node *node;
+
+ hlist_for_each_entry_rcu(vport, node, bucket, hash_node)
+ if (!strcmp(name, vport->ops->get_name(vport)))
+ return vport;
+
+ return NULL;
+}
+
+/**
+ * ovs_vport_alloc - allocate and initialize new vport
+ *
+ * @priv_size: Size of private data area to allocate.
+ * @ops: vport device ops
+ *
+ * Allocate and initialize a new vport defined by @ops. The vport will contain
+ * a private data area of size @priv_size that can be accessed using
+ * vport_priv(). vports that are no longer needed should be released with
+ * vport_free().
+ */
+struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
+ const struct vport_parms *parms)
+{
+ struct vport *vport;
+ size_t alloc_size;
+
+ alloc_size = sizeof(struct vport);
+ if (priv_size) {
+ alloc_size = ALIGN(alloc_size, VPORT_ALIGN);
+ alloc_size += priv_size;
+ }
+
+ vport = kzalloc(alloc_size, GFP_KERNEL);
+ if (!vport)
+ return ERR_PTR(-ENOMEM);
+
+ vport->dp = parms->dp;
+ vport->port_no = parms->port_no;
+ vport->upcall_pid = parms->upcall_pid;
+ vport->ops = ops;
+
+ vport->percpu_stats = alloc_percpu(struct vport_percpu_stats);
+ if (!vport->percpu_stats)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&vport->stats_lock);
+
+ return vport;
+}
+
+/**
+ * ovs_vport_free - uninitialize and free vport
+ *
+ * @vport: vport to free
+ *
+ * Frees a vport allocated with vport_alloc() when it is no longer needed.
+ *
+ * The caller must ensure that an RCU grace period has passed since the last
+ * time @vport was in a datapath.
+ */
+void ovs_vport_free(struct vport *vport)
+{
+ free_percpu(vport->percpu_stats);
+ kfree(vport);
+}
+
+/**
+ * ovs_vport_add - add vport device (for kernel callers)
+ *
+ * @parms: Information about new vport.
+ *
+ * Creates a new vport with the specified configuration (which is dependent on
+ * device type). RTNL lock must be held.
+ */
+struct vport *ovs_vport_add(const struct vport_parms *parms)
+{
+ struct vport *vport;
+ int err = 0;
+ int i;
+
+ ASSERT_RTNL();
+
+ for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) {
+ if (vport_ops_list[i]->type == parms->type) {
+ vport = vport_ops_list[i]->create(parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ goto out;
+ }
+
+ hlist_add_head_rcu(&vport->hash_node,
+ hash_bucket(vport->ops->get_name(vport)));
+ return vport;
+ }
+ }
+
+ err = -EAFNOSUPPORT;
+
+out:
+ return ERR_PTR(err);
+}
+
+/**
+ * ovs_vport_set_options - modify existing vport device (for kernel callers)
+ *
+ * @vport: vport to modify.
+ * @port: New configuration.
+ *
+ * Modifies an existing device with the specified configuration (which is
+ * dependent on device type). RTNL lock must be held.
+ */
+int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
+{
+ ASSERT_RTNL();
+
+ if (!vport->ops->set_options)
+ return -EOPNOTSUPP;
+ return vport->ops->set_options(vport, options);
+}
+
+/**
+ * ovs_vport_del - delete existing vport device
+ *
+ * @vport: vport to delete.
+ *
+ * Detaches @vport from its datapath and destroys it. It is possible to fail
+ * for reasons such as lack of memory. RTNL lock must be held.
+ */
+void ovs_vport_del(struct vport *vport)
+{
+ ASSERT_RTNL();
+
+ hlist_del_rcu(&vport->hash_node);
+
+ vport->ops->destroy(vport);
+}
+
+/**
+ * ovs_vport_get_stats - retrieve device stats
+ *
+ * @vport: vport from which to retrieve the stats
+ * @stats: location to store stats
+ *
+ * Retrieves transmit, receive, and error stats for the given device.
+ *
+ * Must be called with RTNL lock or rcu_read_lock.
+ */
+void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+
+ /* We potentially have 2 sources of stats that need to be combined:
+ * those we have collected (split into err_stats and percpu_stats) from
+ * set_stats() and device error stats from netdev->get_stats() (for
+ * errors that happen downstream and therefore aren't reported through
+ * our vport_record_error() function).
+ * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS).
+ * netdev-stats can be directly read over netlink-ioctl.
+ */
+
+ spin_lock_bh(&vport->stats_lock);
+
+ stats->rx_errors = vport->err_stats.rx_errors;
+ stats->tx_errors = vport->err_stats.tx_errors;
+ stats->tx_dropped = vport->err_stats.tx_dropped;
+ stats->rx_dropped = vport->err_stats.rx_dropped;
+
+ spin_unlock_bh(&vport->stats_lock);
+
+ for_each_possible_cpu(i) {
+ const struct vport_percpu_stats *percpu_stats;
+ struct vport_percpu_stats local_stats;
+ unsigned int start;
+
+ percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
+
+ do {
+ start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
+ local_stats = *percpu_stats;
+ } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
+
+ stats->rx_bytes += local_stats.rx_bytes;
+ stats->rx_packets += local_stats.rx_packets;
+ stats->tx_bytes += local_stats.tx_bytes;
+ stats->tx_packets += local_stats.tx_packets;
+ }
+}
+
+/**
+ * ovs_vport_get_options - retrieve device options
+ *
+ * @vport: vport from which to retrieve the options.
+ * @skb: sk_buff where options should be appended.
+ *
+ * Retrieves the configuration of the given device, appending an
+ * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested
+ * vport-specific attributes to @skb.
+ *
+ * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another
+ * negative error code if a real error occurred. If an error occurs, @skb is
+ * left unmodified.
+ *
+ * Must be called with RTNL lock or rcu_read_lock.
+ */
+int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (vport->ops->get_options) {
+ int err = vport->ops->get_options(vport, skb);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ }
+ }
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/**
+ * ovs_vport_receive - pass up received packet to the datapath for processing
+ *
+ * @vport: vport that received the packet
+ * @skb: skb that was received
+ *
+ * Must be called with rcu_read_lock. The packet cannot be shared and
+ * skb->data should point to the Ethernet header. The caller must have already
+ * called compute_ip_summed() to initialize the checksumming fields.
+ */
+void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
+{
+ struct vport_percpu_stats *stats;
+
+ stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
+
+ u64_stats_update_begin(&stats->sync);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->sync);
+
+ ovs_dp_process_received_packet(vport, skb);
+}
+
+/**
+ * ovs_vport_send - send a packet on a device
+ *
+ * @vport: vport on which to send the packet
+ * @skb: skb to send
+ *
+ * Sends the given packet and returns the length of data sent. Either RTNL
+ * lock or rcu_read_lock must be held.
+ */
+int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+{
+ int sent = vport->ops->send(vport, skb);
+
+ if (likely(sent)) {
+ struct vport_percpu_stats *stats;
+
+ stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
+
+ u64_stats_update_begin(&stats->sync);
+ stats->tx_packets++;
+ stats->tx_bytes += sent;
+ u64_stats_update_end(&stats->sync);
+ }
+ return sent;
+}
+
+/**
+ * ovs_vport_record_error - indicate device error to generic stats layer
+ *
+ * @vport: vport that encountered the error
+ * @err_type: one of enum vport_err_type types to indicate the error type
+ *
+ * If using the vport generic stats layer indicate that an error of the given
+ * type has occured.
+ */
+void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
+{
+ spin_lock(&vport->stats_lock);
+
+ switch (err_type) {
+ case VPORT_E_RX_DROPPED:
+ vport->err_stats.rx_dropped++;
+ break;
+
+ case VPORT_E_RX_ERROR:
+ vport->err_stats.rx_errors++;
+ break;
+
+ case VPORT_E_TX_DROPPED:
+ vport->err_stats.tx_dropped++;
+ break;
+
+ case VPORT_E_TX_ERROR:
+ vport->err_stats.tx_errors++;
+ break;
+ };
+
+ spin_unlock(&vport->stats_lock);
+}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
new file mode 100644
index 000000000000..19609629dabd
--- /dev/null
+++ b/net/openvswitch/vport.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_H
+#define VPORT_H 1
+
+#include <linux/list.h>
+#include <linux/openvswitch.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/u64_stats_sync.h>
+
+#include "datapath.h"
+
+struct vport;
+struct vport_parms;
+
+/* The following definitions are for users of the vport subsytem: */
+
+int ovs_vport_init(void);
+void ovs_vport_exit(void);
+
+struct vport *ovs_vport_add(const struct vport_parms *);
+void ovs_vport_del(struct vport *);
+
+struct vport *ovs_vport_locate(const char *name);
+
+void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
+
+int ovs_vport_set_options(struct vport *, struct nlattr *options);
+int ovs_vport_get_options(const struct vport *, struct sk_buff *);
+
+int ovs_vport_send(struct vport *, struct sk_buff *);
+
+/* The following definitions are for implementers of vport devices: */
+
+struct vport_percpu_stats {
+ u64 rx_bytes;
+ u64 rx_packets;
+ u64 tx_bytes;
+ u64 tx_packets;
+ struct u64_stats_sync sync;
+};
+
+struct vport_err_stats {
+ u64 rx_dropped;
+ u64 rx_errors;
+ u64 tx_dropped;
+ u64 tx_errors;
+};
+
+/**
+ * struct vport - one port within a datapath
+ * @rcu: RCU callback head for deferred destruction.
+ * @port_no: Index into @dp's @ports array.
+ * @dp: Datapath to which this port belongs.
+ * @node: Element in @dp's @port_list.
+ * @upcall_pid: The Netlink port to use for packets received on this port that
+ * miss the flow table.
+ * @hash_node: Element in @dev_table hash table in vport.c.
+ * @ops: Class structure.
+ * @percpu_stats: Points to per-CPU statistics used and maintained by vport
+ * @stats_lock: Protects @err_stats;
+ * @err_stats: Points to error statistics used and maintained by vport
+ */
+struct vport {
+ struct rcu_head rcu;
+ u16 port_no;
+ struct datapath *dp;
+ struct list_head node;
+ u32 upcall_pid;
+
+ struct hlist_node hash_node;
+ const struct vport_ops *ops;
+
+ struct vport_percpu_stats __percpu *percpu_stats;
+
+ spinlock_t stats_lock;
+ struct vport_err_stats err_stats;
+};
+
+/**
+ * struct vport_parms - parameters for creating a new vport
+ *
+ * @name: New vport's name.
+ * @type: New vport's type.
+ * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
+ * none was supplied.
+ * @dp: New vport's datapath.
+ * @port_no: New vport's port number.
+ */
+struct vport_parms {
+ const char *name;
+ enum ovs_vport_type type;
+ struct nlattr *options;
+
+ /* For ovs_vport_alloc(). */
+ struct datapath *dp;
+ u16 port_no;
+ u32 upcall_pid;
+};
+
+/**
+ * struct vport_ops - definition of a type of virtual port
+ *
+ * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
+ * @create: Create a new vport configured as specified. On success returns
+ * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
+ * @destroy: Destroys a vport. Must call vport_free() on the vport but not
+ * before an RCU grace period has elapsed.
+ * @set_options: Modify the configuration of an existing vport. May be %NULL
+ * if modification is not supported.
+ * @get_options: Appends vport-specific attributes for the configuration of an
+ * existing vport to a &struct sk_buff. May be %NULL for a vport that does not
+ * have any configuration.
+ * @get_name: Get the device's name.
+ * @get_config: Get the device's configuration.
+ * @get_ifindex: Get the system interface index associated with the device.
+ * May be null if the device does not have an ifindex.
+ * @send: Send a packet on the device. Returns the length of the packet sent.
+ */
+struct vport_ops {
+ enum ovs_vport_type type;
+
+ /* Called with RTNL lock. */
+ struct vport *(*create)(const struct vport_parms *);
+ void (*destroy)(struct vport *);
+
+ int (*set_options)(struct vport *, struct nlattr *);
+ int (*get_options)(const struct vport *, struct sk_buff *);
+
+ /* Called with rcu_read_lock or RTNL lock. */
+ const char *(*get_name)(const struct vport *);
+ void (*get_config)(const struct vport *, void *);
+ int (*get_ifindex)(const struct vport *);
+
+ int (*send)(struct vport *, struct sk_buff *);
+};
+
+enum vport_err_type {
+ VPORT_E_RX_DROPPED,
+ VPORT_E_RX_ERROR,
+ VPORT_E_TX_DROPPED,
+ VPORT_E_TX_ERROR,
+};
+
+struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
+ const struct vport_parms *);
+void ovs_vport_free(struct vport *);
+
+#define VPORT_ALIGN 8
+
+/**
+ * vport_priv - access private data area of vport
+ *
+ * @vport: vport to access
+ *
+ * If a nonzero size was passed in priv_size of vport_alloc() a private data
+ * area was allocated on creation. This allows that area to be accessed and
+ * used for any purpose needed by the vport implementer.
+ */
+static inline void *vport_priv(const struct vport *vport)
+{
+ return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
+}
+
+/**
+ * vport_from_priv - lookup vport from private data pointer
+ *
+ * @priv: Start of private data area.
+ *
+ * It is sometimes useful to translate from a pointer to the private data
+ * area to the vport, such as in the case where the private data pointer is
+ * the result of a hash table lookup. @priv must point to the start of the
+ * private data area.
+ */
+static inline struct vport *vport_from_priv(const void *priv)
+{
+ return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
+}
+
+void ovs_vport_receive(struct vport *, struct sk_buff *);
+void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
+
+/* List of statically compiled vport implementations. Don't forget to also
+ * add yours to the list at the top of vport.c. */
+extern const struct vport_ops ovs_netdev_vport_ops;
+extern const struct vport_ops ovs_internal_vport_ops;
+
+#endif /* vport.h */
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index ed1336e15920..45326599fda3 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -277,7 +277,7 @@ static inline int teql_resolve(struct sk_buff *skb,
return 0;
rcu_read_lock();
- mn = dst_get_neighbour(dst);
+ mn = dst_get_neighbour_noref(dst);
res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0;
rcu_read_unlock();
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4fce1cec193e..82e803b56952 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1499,7 +1499,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
goto free_dst;
/* Copy neighbour for reachability confirmation */
- dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour(dst)));
+ dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour_noref(dst)));
xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
xfrm_init_pmtu(dst_prev);
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 199616bb68d3..7bd6f138236b 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -114,6 +114,7 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb,
int offset, ret = 0;
struct ipv6hdr *ip6;
u8 nexthdr;
+ __be16 frag_off;
ip6 = ipv6_hdr(skb);
if (ip6 == NULL)
@@ -126,7 +127,7 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb,
offset = skb_network_offset(skb);
offset += sizeof(*ip6);
nexthdr = ip6->nexthdr;
- offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
if (offset < 0)
return 0;
if (proto)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7e6c2564e741..cca09bb46502 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3561,6 +3561,7 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb,
u8 nexthdr;
int ret = -EINVAL, offset;
struct ipv6hdr _ipv6h, *ip6;
+ __be16 frag_off;
offset = skb_network_offset(skb);
ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
@@ -3573,7 +3574,7 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb,
nexthdr = ip6->nexthdr;
offset += sizeof(_ipv6h);
- offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
if (offset < 0)
goto out;