From 0409c9a5a7546043d32e8c46df0dcaf65f97f659 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:43 +0100
Subject: net: fib6: fib6_commit_metrics: fix potential NULL pointer
 dereference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When IPv6 host routes with metrics attached are being added, we fetch
the metrics store from the dst via COW through dst_metrics_write_ptr(),
added through commit e5fd387ad5b3.

One remaining problem here is that we actually call into inet_getpeer()
and may end up allocating/creating a new peer from the kmemcache, which
may fail.

Example trace from perf probe (inet_getpeer:41) where create is 1:

ip 6877 [002] 4221.391591: probe:inet_getpeer: (ffffffff8165e293)
  85e294 inet_getpeer.part.7 (<- kmem_cache_alloc())
  85e578 inet_getpeer
  8eb333 ipv6_cow_metrics
  8f10ff fib6_commit_metrics

Therefore, a check for NULL on the return of dst_metrics_write_ptr()
is necessary here.

Joint work with Florian Westphal.

Fixes: e5fd387ad5b3 ("ipv6: do not overwrite inetpeer metrics prematurely")
Cc: Michal Kubeček <mkubecek@suse.cz>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b2d1838897c9..db4984e13f2f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -633,18 +633,17 @@ static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 static int fib6_commit_metrics(struct dst_entry *dst,
 			       struct nlattr *mx, int mx_len)
 {
+	bool dst_host = dst->flags & DST_HOST;
 	struct nlattr *nla;
 	int remaining;
 	u32 *mp;
 
-	if (dst->flags & DST_HOST) {
-		mp = dst_metrics_write_ptr(dst);
-	} else {
-		mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
-		if (!mp)
-			return -ENOMEM;
+	mp = dst_host ? dst_metrics_write_ptr(dst) :
+			kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+	if (unlikely(!mp))
+		return -ENOMEM;
+	if (!dst_host)
 		dst_init_metrics(dst, mp, 0);
-	}
 
 	nla_for_each_attr(nla, mx, mx_len, remaining) {
 		int type = nla_type(nla);
-- 
cgit v1.2.3


From e715b6d3a5ef55834778d49224e60e8ccb5bf45f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 5 Jan 2015 23:57:44 +0100
Subject: net: fib6: convert cfg metric to u32 outside of table write lock

Do the nla validation earlier, outside the write lock.

This is needed by followup patch which needs to be able to call
request_module (which can sleep) if needed.

Joint work with Daniel Borkmann.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 10 +++++---
 net/ipv6/ip6_fib.c    | 69 +++++++++++++++++++++++++++------------------------
 net/ipv6/route.c      | 57 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 8eea35d32a75..20e80fa7bbdd 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -74,6 +74,11 @@ struct fib6_node {
 #define FIB6_SUBTREE(fn)	((fn)->subtree)
 #endif
 
+struct mx6_config {
+	const u32 *mx;
+	DECLARE_BITMAP(mx_valid, RTAX_MAX);
+};
+
 /*
  *	routing information
  *
@@ -291,9 +296,8 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 		    void *arg);
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
-	     struct nlattr *mx, int mx_len);
-
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+	     struct nl_info *info, struct mx6_config *mxc);
 int fib6_del(struct rt6_info *rt, struct nl_info *info);
 
 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index db4984e13f2f..03c520a4ebeb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -630,31 +630,35 @@ static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 	       RTF_GATEWAY;
 }
 
-static int fib6_commit_metrics(struct dst_entry *dst,
-			       struct nlattr *mx, int mx_len)
+static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
 {
-	bool dst_host = dst->flags & DST_HOST;
-	struct nlattr *nla;
-	int remaining;
-	u32 *mp;
+	int i;
 
-	mp = dst_host ? dst_metrics_write_ptr(dst) :
-			kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
-	if (unlikely(!mp))
-		return -ENOMEM;
-	if (!dst_host)
-		dst_init_metrics(dst, mp, 0);
+	for (i = 0; i < RTAX_MAX; i++) {
+		if (test_bit(i, mxc->mx_valid))
+			mp[i] = mxc->mx[i];
+	}
+}
+
+static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+{
+	if (!mxc->mx)
+		return 0;
 
-	nla_for_each_attr(nla, mx, mx_len, remaining) {
-		int type = nla_type(nla);
+	if (dst->flags & DST_HOST) {
+		u32 *mp = dst_metrics_write_ptr(dst);
 
-		if (type) {
-			if (type > RTAX_MAX)
-				return -EINVAL;
+		if (unlikely(!mp))
+			return -ENOMEM;
 
-			mp[type - 1] = nla_get_u32(nla);
-		}
+		fib6_copy_metrics(mp, mxc);
+	} else {
+		dst_init_metrics(dst, mxc->mx, false);
+
+		/* We've stolen mx now. */
+		mxc->mx = NULL;
 	}
+
 	return 0;
 }
 
@@ -663,7 +667,7 @@ static int fib6_commit_metrics(struct dst_entry *dst,
  */
 
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-			    struct nl_info *info, struct nlattr *mx, int mx_len)
+			    struct nl_info *info, struct mx6_config *mxc)
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
@@ -772,11 +776,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			pr_warn("NLM_F_CREATE should be set when creating new route\n");
 
 add:
-		if (mx) {
-			err = fib6_commit_metrics(&rt->dst, mx, mx_len);
-			if (err)
-				return err;
-		}
+		err = fib6_commit_metrics(&rt->dst, mxc);
+		if (err)
+			return err;
+
 		rt->dst.rt6_next = iter;
 		*ins = rt;
 		rt->rt6i_node = fn;
@@ -796,11 +799,11 @@ add:
 			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
 			return -ENOENT;
 		}
-		if (mx) {
-			err = fib6_commit_metrics(&rt->dst, mx, mx_len);
-			if (err)
-				return err;
-		}
+
+		err = fib6_commit_metrics(&rt->dst, mxc);
+		if (err)
+			return err;
+
 		*ins = rt;
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
@@ -837,8 +840,8 @@ void fib6_force_start_gc(struct net *net)
  *	with source addr info in sub-trees
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
-	     struct nlattr *mx, int mx_len)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+	     struct nl_info *info, struct mx6_config *mxc)
 {
 	struct fib6_node *fn, *pn = NULL;
 	int err = -ENOMEM;
@@ -933,7 +936,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, info, mx, mx_len);
+	err = fib6_add_rt2node(fn, rt, info, mxc);
 	if (!err) {
 		fib6_start_gc(info->nl_net, rt);
 		if (!(rt->rt6i_flags & RTF_CACHE))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c91083156edb..454771d20b21 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -853,14 +853,14 @@ EXPORT_SYMBOL(rt6_lookup);
  */
 
 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
-			struct nlattr *mx, int mx_len)
+			struct mx6_config *mxc)
 {
 	int err;
 	struct fib6_table *table;
 
 	table = rt->rt6i_table;
 	write_lock_bh(&table->tb6_lock);
-	err = fib6_add(&table->tb6_root, rt, info, mx, mx_len);
+	err = fib6_add(&table->tb6_root, rt, info, mxc);
 	write_unlock_bh(&table->tb6_lock);
 
 	return err;
@@ -868,10 +868,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 
 int ip6_ins_rt(struct rt6_info *rt)
 {
-	struct nl_info info = {
-		.nl_net = dev_net(rt->dst.dev),
-	};
-	return __ip6_ins_rt(rt, &info, NULL, 0);
+	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
+	struct mx6_config mxc = { .mx = NULL, };
+
+	return __ip6_ins_rt(rt, &info, &mxc);
 }
 
 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
@@ -1470,9 +1470,39 @@ out:
 	return entries > rt_max_size;
 }
 
-/*
- *
- */
+static int ip6_convert_metrics(struct mx6_config *mxc,
+			       const struct fib6_config *cfg)
+{
+	struct nlattr *nla;
+	int remaining;
+	u32 *mp;
+
+	if (cfg->fc_mx == NULL)
+		return 0;
+
+	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+	if (unlikely(!mp))
+		return -ENOMEM;
+
+	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+		int type = nla_type(nla);
+
+		if (type) {
+			if (unlikely(type > RTAX_MAX))
+				goto err;
+
+			mp[type - 1] = nla_get_u32(nla);
+			__set_bit(type - 1, mxc->mx_valid);
+		}
+	}
+
+	mxc->mx = mp;
+
+	return 0;
+ err:
+	kfree(mp);
+	return -EINVAL;
+}
 
 int ip6_route_add(struct fib6_config *cfg)
 {
@@ -1482,6 +1512,7 @@ int ip6_route_add(struct fib6_config *cfg)
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
 	struct fib6_table *table;
+	struct mx6_config mxc = { .mx = NULL, };
 	int addr_type;
 
 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
@@ -1677,8 +1708,14 @@ install_route:
 
 	cfg->fc_nlinfo.nl_net = dev_net(dev);
 
-	return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len);
+	err = ip6_convert_metrics(&mxc, cfg);
+	if (err)
+		goto out;
+
+	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
 
+	kfree(mxc.mx);
+	return err;
 out:
 	if (dev)
 		dev_put(dev);
-- 
cgit v1.2.3


From 29ba4fffd396f4beefe34d24d7bcd86cb5c3e492 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:45 +0100
Subject: net: tcp: refactor reinitialization of congestion control

We can just move this to an extra function and make the code
a bit more readable, no functional change.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cong.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..38f2f8aa4ceb 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -107,6 +107,18 @@ void tcp_init_congestion_control(struct sock *sk)
 		icsk->icsk_ca_ops->init(sk);
 }
 
+static void tcp_reinit_congestion_control(struct sock *sk,
+					  const struct tcp_congestion_ops *ca)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_cleanup_congestion_control(sk);
+	icsk->icsk_ca_ops = ca;
+
+	if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
+}
+
 /* Manage refcounts on socket close. */
 void tcp_cleanup_congestion_control(struct sock *sk)
 {
@@ -262,21 +274,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 #endif
 	if (!ca)
 		err = -ENOENT;
-
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
 		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
 		err = -EPERM;
-
 	else if (!try_module_get(ca->owner))
 		err = -EBUSY;
-
-	else {
-		tcp_cleanup_congestion_control(sk);
-		icsk->icsk_ca_ops = ca;
-
-		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
-			icsk->icsk_ca_ops->init(sk);
-	}
+	else
+		tcp_reinit_congestion_control(sk, ca);
  out:
 	rcu_read_unlock();
 	return err;
-- 
cgit v1.2.3


From c5c6a8ab45ec0f18733afb4aaade0d4a139d80b3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:46 +0100
Subject: net: tcp: add key management to congestion control

This patch adds necessary infrastructure to the congestion control
framework for later per route congestion control support.

For a per route congestion control possibility, our aim is to store
a unique u32 key identifier into dst metrics, which can then be
mapped into a tcp_congestion_ops struct. We argue that having a
RTAX key entry is the most simple, generic and easy way to manage,
and also keeps the memory footprint of dst entries lower on 64 bit
than with storing a pointer directly, for example. Having a unique
key id also allows for decoupling actual TCP congestion control
module management from the FIB layer, i.e. we don't have to care
about expensive module refcounting inside the FIB at this point.

We first thought of using an IDR store for the realization, which
takes over dynamic assignment of unused key space and also performs
the key to pointer mapping in RCU. While doing so, we stumbled upon
the issue that due to the nature of dynamic key distribution, it
just so happens, arguably in very rare occasions, that excessive
module loads and unloads can lead to a possible reuse of previously
used key space. Thus, previously stale keys in the dst metric are
now being reassigned to a different congestion control algorithm,
which might lead to unexpected behaviour. One way to resolve this
would have been to walk FIBs on the actually rare occasion of a
module unload and reset the metric keys for each FIB in each netns,
but that's just very costly.

Therefore, we argue a better solution is to reuse the unique
congestion control algorithm name member and map that into u32 key
space through jhash. For that, we split the flags attribute (as it
currently uses 2 bits only anyway) into two u32 attributes, flags
and key, so that we can keep the cacheline boundary of 2 cachelines
on x86_64 and cache the precalculated key at registration time for
the fast path. On average we might expect 2 - 4 modules being loaded
worst case perhaps 15, so a key collision possibility is extremely
low, and guaranteed collision-free on LE/BE for all in-tree modules.
Overall this results in much simpler code, and all without the
overhead of an IDR. Due to the deterministic nature, modules can
now be unloaded, the congestion control algorithm for a specific
but unloaded key will fall back to the default one, and on module
reload time it will switch back to the expected algorithm
transparently.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  3 +-
 include/net/tcp.h                  |  9 +++-
 net/ipv4/tcp_cong.c                | 97 +++++++++++++++++++++++++++++++-------
 3 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 848e85cb5c61..5976bdecf58b 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -98,7 +98,8 @@ struct inet_connection_sock {
 	const struct tcp_congestion_ops *icsk_ca_ops;
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
-	__u8			  icsk_ca_state;
+	__u8			  icsk_ca_state:7,
+				  icsk_ca_dst_locked:1;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
 	__u8			  icsk_backoff;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf76f..135b70c9a734 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -787,6 +787,8 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+#define TCP_CA_UNSPEC	0
+
 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
@@ -794,7 +796,8 @@ enum tcp_ca_ack_event_flags {
 
 struct tcp_congestion_ops {
 	struct list_head	list;
-	unsigned long flags;
+	u32 key;
+	u32 flags;
 
 	/* initialize private data (optional) */
 	void (*init)(struct sock *sk);
@@ -841,6 +844,10 @@ u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
+u32 tcp_ca_get_key_by_name(const char *name);
+char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 38f2f8aa4ceb..63c29dba68a8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/gfp.h>
+#include <linux/jhash.h>
 #include <net/tcp.h>
 
 static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 	return NULL;
 }
 
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+	const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+	if (!ca && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("tcp_%s", name);
+		rcu_read_lock();
+		ca = tcp_ca_find(name);
+	}
+#endif
+	return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+		if (e->key == key)
+			return e;
+	}
+
+	return NULL;
+}
+
 /*
  * Attach new congestion control algorithm to the list
  * of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		return -EINVAL;
 	}
 
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
 	spin_lock(&tcp_cong_list_lock);
-	if (tcp_ca_find(ca->name)) {
-		pr_notice("%s already registered\n", ca->name);
+	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+		pr_notice("%s already registered or non-unique key\n",
+			  ca->name);
 		ret = -EEXIST;
 	} else {
 		list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 	spin_lock(&tcp_cong_list_lock);
 	list_del_rcu(&ca->list);
 	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module gets removed entirely.
+	 *
+	 * A try_module_get() should fail by now as our module is
+	 * in "going" state since no refs are held anymore and
+	 * module_exit() handler being called.
+	 */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+	const struct tcp_congestion_ops *ca;
+	u32 key;
+
+	might_sleep();
+
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+	key = ca ? ca->key : TCP_CA_UNSPEC;
+	rcu_read_unlock();
+
+	return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	const struct tcp_congestion_ops *ca;
+	char *ret = NULL;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(key);
+	if (ca)
+		ret = strncpy(buffer, ca->name,
+			      TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
 /* Assign choice of congestion control. */
 void tcp_assign_congestion_control(struct sock *sk)
 {
@@ -253,25 +326,17 @@ out:
 int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_congestion_ops *ca;
+	const struct tcp_congestion_ops *ca;
 	int err = 0;
 
-	rcu_read_lock();
-	ca = tcp_ca_find(name);
+	if (icsk->icsk_ca_dst_locked)
+		return -EPERM;
 
-	/* no change asking for existing value */
+	rcu_read_lock();
+	ca = __tcp_ca_find_autoload(name);
+	/* No change asking for existing value */
 	if (ca == icsk->icsk_ca_ops)
 		goto out;
-
-#ifdef CONFIG_MODULES
-	/* not found attempt to autoload module */
-	if (!ca && capable(CAP_NET_ADMIN)) {
-		rcu_read_unlock();
-		request_module("tcp_%s", name);
-		rcu_read_lock();
-		ca = tcp_ca_find(name);
-	}
-#endif
 	if (!ca)
 		err = -ENOENT;
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-- 
cgit v1.2.3


From ea697639992d96da98016b8934e68a73876a2264 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:47 +0100
Subject: net: tcp: add RTAX_CC_ALGO fib handling

This patch adds the minimum necessary for the RTAX_CC_ALGO congestion
control metric to be set up and dumped back to user space.

While the internal representation of RTAX_CC_ALGO is handled as a u32
key, we avoided to expose this implementation detail to user space, thus
instead, we chose the netlink attribute that is being exchanged between
user space to be the actual congestion control algorithm name, similarly
as in the setsockopt(2) API in order to allow for maximum flexibility,
even for 3rd party modules.

It is a bit unfortunate that RTAX_QUICKACK used up a whole RTAX slot as
it should have been stored in RTAX_FEATURES instead, we first thought
about reusing it for the congestion control key, but it brings more
complications and/or confusion than worth it.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h              |  7 +++++++
 include/uapi/linux/rtnetlink.h |  2 ++
 net/core/rtnetlink.c           | 15 +++++++++++++--
 net/decnet/dn_fib.c            |  3 ++-
 net/decnet/dn_table.c          |  4 +++-
 net/ipv4/fib_semantics.c       | 14 ++++++++++++--
 net/ipv6/route.c               | 17 +++++++++++++++--
 7 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 135b70c9a734..95bb237152e0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -846,7 +846,14 @@ extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
 u32 tcp_ca_get_key_by_name(const char *name);
+#ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
+#else
+static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	return NULL;
+}
+#endif
 
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9c9b8b4480cd..d81f22d5b390 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -389,6 +389,8 @@ enum {
 #define RTAX_INITRWND RTAX_INITRWND
 	RTAX_QUICKACK,
 #define RTAX_QUICKACK RTAX_QUICKACK
+	RTAX_CC_ALGO,
+#define RTAX_CC_ALGO RTAX_CC_ALGO
 	__RTAX_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index da983d4bac02..6a6cdade1676 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -50,6 +50,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/udp.h>
+#include <net/tcp.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/fib_rules.h>
@@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 
 	for (i = 0; i < RTAX_MAX; i++) {
 		if (metrics[i]) {
+			if (i == RTAX_CC_ALGO - 1) {
+				char tmp[TCP_CA_NAME_MAX], *name;
+
+				name = tcp_ca_get_name_by_key(metrics[i], tmp);
+				if (!name)
+					continue;
+				if (nla_put_string(skb, i + 1, name))
+					goto nla_put_failure;
+			} else {
+				if (nla_put_u32(skb, i + 1, metrics[i]))
+					goto nla_put_failure;
+			}
 			valid++;
-			if (nla_put_u32(skb, i+1, metrics[i]))
-				goto nla_put_failure;
 		}
 	}
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index d332aefb0846..df4803437888 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
 			int type = nla_type(attr);
 
 			if (type) {
-				if (type > RTAX_MAX || nla_len(attr) < 4)
+				if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+				    nla_len(attr) < 4)
 					goto err_inval;
 
 				fi->fib_metrics[type-1] = nla_get_u32(attr);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 86e3807052e9..3f19fcbf126d 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -29,6 +29,7 @@
 #include <linux/route.h> /* RTF_xxx */
 #include <net/neighbour.h>
 #include <net/netlink.h>
+#include <net/tcp.h>
 #include <net/dst.h>
 #include <net/flow.h>
 #include <net/fib_rules.h>
@@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
 			 + nla_total_size(4) /* RTA_TABLE */
 			 + nla_total_size(2) /* RTA_DST */
-			 + nla_total_size(4); /* RTA_PRIORITY */
+			 + nla_total_size(4) /* RTA_PRIORITY */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f99f41bd15b8..d2b7b5521b1b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
 			 + nla_total_size(4) /* RTA_TABLE */
 			 + nla_total_size(4) /* RTA_DST */
 			 + nla_total_size(4) /* RTA_PRIORITY */
-			 + nla_total_size(4); /* RTA_PREFSRC */
+			 + nla_total_size(4) /* RTA_PREFSRC */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 
 	/* space for nested metrics */
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 
 				if (type > RTAX_MAX)
 					goto err_inval;
-				val = nla_get_u32(nla);
+				if (type == RTAX_CC_ALGO) {
+					char tmp[TCP_CA_NAME_MAX];
+
+					nla_strlcpy(tmp, nla, sizeof(tmp));
+					val = tcp_ca_get_key_by_name(tmp);
+					if (val == TCP_CA_UNSPEC)
+						goto err_inval;
+				} else {
+					val = nla_get_u32(nla);
+				}
 				if (type == RTAX_ADVMSS && val > 65535 - 40)
 					val = 65535 - 40;
 				if (type == RTAX_MTU && val > 65535 - 15)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 454771d20b21..34dcbb59df75 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1488,10 +1488,22 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 		int type = nla_type(nla);
 
 		if (type) {
+			u32 val;
+
 			if (unlikely(type > RTAX_MAX))
 				goto err;
+			if (type == RTAX_CC_ALGO) {
+				char tmp[TCP_CA_NAME_MAX];
+
+				nla_strlcpy(tmp, nla, sizeof(tmp));
+				val = tcp_ca_get_key_by_name(tmp);
+				if (val == TCP_CA_UNSPEC)
+					goto err;
+			} else {
+				val = nla_get_u32(nla);
+			}
 
-			mp[type - 1] = nla_get_u32(nla);
+			mp[type - 1] = val;
 			__set_bit(type - 1, mxc->mx_valid);
 		}
 	}
@@ -2571,7 +2583,8 @@ static inline size_t rt6_nlmsg_size(void)
 	       + nla_total_size(4) /* RTA_OIF */
 	       + nla_total_size(4) /* RTA_PRIORITY */
 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
-	       + nla_total_size(sizeof(struct rta_cacheinfo));
+	       + nla_total_size(sizeof(struct rta_cacheinfo))
+	       + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
 }
 
 static int rt6_fill_node(struct net *net,
-- 
cgit v1.2.3


From 81164413ad096bafe8ad1068f3f095a7dd081d8b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 5 Jan 2015 23:57:48 +0100
Subject: net: tcp: add per route congestion control

This work adds the possibility to define a per route/destination
congestion control algorithm. Generally, this opens up the possibility
for a machine with different links to enforce specific congestion
control algorithms with optimal strategies for each of them based
on their network characteristics, even transparently for a single
application listening on all links.

For our specific use case, this additionally facilitates deployment
of DCTCP, for example, applications can easily serve internal
traffic/dsts in DCTCP and external one with CUBIC. Other scenarios
would also allow for utilizing e.g. long living, low priority
background flows for certain destinations/routes while still being
able for normal traffic to utilize the default congestion control
algorithm. We also thought about a per netns setting (where different
defaults are possible), but given its actually a link specific
property, we argue that a per route/destination setting is the most
natural and flexible.

The administrator can utilize this through ip-route(8) by appending
"congctl [lock] <name>", where <name> denotes the name of a
congestion control algorithm and the optional lock parameter allows
to enforce the given algorithm so that applications in user space
would not be allowed to overwrite that algorithm for that destination.

The dst metric lookups are being done when a dst entry is already
available in order to avoid a costly lookup and still before the
algorithms are being initialized, thus overhead is very low when the
feature is not being used. While the client side would need to drop
the current reference on the module, on server side this can actually
even be avoided as we just got a flat-copied socket clone.

Joint work with Florian Westphal.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  6 ++++++
 net/ipv4/tcp_ipv4.c      |  2 ++
 net/ipv4/tcp_minisocks.c | 30 ++++++++++++++++++++++++++----
 net/ipv4/tcp_output.c    | 21 +++++++++++++++++++++
 net/ipv6/tcp_ipv6.c      |  2 ++
 5 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 95bb237152e0..b8fdc6bab3f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
 				      struct request_sock *req,
 				      struct sk_buff *skb);
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req,
 				  struct dst_entry *dst);
@@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
 	return jiffies_to_usecs(tcp_rto_min(sk));
 }
 
+static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+{
+	return dst_metric_locked(dst, RTAX_CC_ALGO);
+}
+
 /* Compute the actual receive window we are currently advertising.
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..ad3e65bdd368 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 	sk_setup_caps(newsk, dst);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..bc9216dc9de1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
 	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
 }
 
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+	bool ca_got_dst = false;
+
+	if (ca_key != TCP_CA_UNSPEC) {
+		const struct tcp_congestion_ops *ca;
+
+		rcu_read_lock();
+		ca = tcp_ca_find_key(ca_key);
+		if (likely(ca && try_module_get(ca->owner))) {
+			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+			icsk->icsk_ca_ops = ca;
+			ca_got_dst = true;
+		}
+		rcu_read_unlock();
+	}
+
+	if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
+		tcp_assign_congestion_control(sk);
+
+	tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 
-		if (!try_module_get(newicsk->icsk_ca_ops->owner))
-			tcp_assign_congestion_control(newsk);
-
-		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		__skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262e2326..dc30cb563e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 }
 EXPORT_SYMBOL(tcp_make_synack);
 
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ca;
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+	if (ca_key == TCP_CA_UNSPEC)
+		return;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(ca_key);
+	if (likely(ca && try_module_get(ca->owner))) {
+		module_put(icsk->icsk_ca_ops->owner);
+		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+		icsk->icsk_ca_ops = ca;
+	}
+	rcu_read_unlock();
+}
+
 /* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
 {
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
+	tcp_ca_dst_init(sk, dst);
+
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric_advmss(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9c0b54e87b47..5d46832c6f72 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1199,6 +1199,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
 						     newnp->opt->opt_flen);
 
+	tcp_ca_openreq_child(newsk, dst);
+
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	if (tcp_sk(sk)->rx_opt.user_mss &&
-- 
cgit v1.2.3