summaryrefslogtreecommitdiff
path: root/net/ipv4/fib_trie.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2015-04-20 11:32:26 +1000
committerDave Airlie <airlied@redhat.com>2015-04-20 13:05:20 +1000
commit2c33ce009ca2389dbf0535d0672214d09738e35e (patch)
tree6186a6458c3c160385d794a23eaf07c786a9e61b /net/ipv4/fib_trie.c
parentcec32a47010647e8b0603726ebb75b990a4057a4 (diff)
parent09d51602cf84a1264946711dd4ea0dddbac599a1 (diff)
Merge Linus master into drm-next
The merge is clean, but the arm build fails afterwards, due to API changes in the regulator tree. I've included the patch into the merge to fix the build. Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'net/ipv4/fib_trie.c')
-rw-r--r--net/ipv4/fib_trie.c1767
1 files changed, 953 insertions, 814 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3daf0224ff2e..e13fcc602da2 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -79,6 +79,7 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/switchdev.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
@@ -88,38 +89,35 @@
typedef unsigned int t_key;
-#define IS_TNODE(n) ((n)->bits)
-#define IS_LEAF(n) (!(n)->bits)
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
-
-struct tnode {
+struct key_vector {
t_key key;
- unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
union {
- /* The fields in this struct are valid if bits > 0 (TNODE) */
- struct {
- t_key empty_children; /* KEYLENGTH bits needed */
- t_key full_children; /* KEYLENGTH bits needed */
- struct tnode __rcu *child[0];
- };
- /* This list pointer if valid if bits == 0 (LEAF) */
- struct hlist_head list;
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ struct key_vector __rcu *tnode[0];
};
};
-struct leaf_info {
- struct hlist_node hlist;
- int plen;
- u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
- struct list_head falh;
+struct tnode {
struct rcu_head rcu;
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
};
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -142,13 +140,13 @@ struct trie_stat {
};
struct trie {
- struct tnode __rcu *trie;
+ struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
-static void resize(struct trie *t, struct tnode *tn);
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static size_t tnode_free_size;
/*
@@ -161,41 +159,46 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+ return container_of(kv, struct tnode, kv[0]);
+}
+
/* caller must hold RTNL */
-#define node_parent(n) rtnl_dereference((n)->parent)
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
/* caller must hold RCU read lock or RTNL */
-#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
/* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct tnode *n, struct tnode *tp)
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
if (n)
- rcu_assign_pointer(n->parent, tp);
+ rcu_assign_pointer(tn_info(n)->parent, tp);
}
-#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
/* This provides us with the number of children in this node, in the case of a
* leaf this will return 0 meaning none of the children are accessible.
*/
-static inline unsigned long tnode_child_length(const struct tnode *tn)
+static inline unsigned long child_length(const struct key_vector *tn)
{
return (1ul << tn->bits) & ~(1ul);
}
-/* caller must hold RTNL */
-static inline struct tnode *tnode_get_child(const struct tnode *tn,
- unsigned long i)
-{
- return rtnl_dereference(tn->child[i]);
-}
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
-/* caller must hold RCU read lock or RTNL */
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
- unsigned long i)
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
- return rcu_dereference_rtnl(tn->child[i]);
+ unsigned long index = key ^ kv->key;
+
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
+
+ return index >> kv->pos;
}
/* To understand this stuff, an understanding of keys and all their bits is
@@ -274,106 +277,104 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
}
#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
+ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
static void __node_free_rcu(struct rcu_head *head)
{
struct tnode *n = container_of(head, struct tnode, rcu);
- if (IS_LEAF(n))
+ if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->bits <= TNODE_KMALLOC_MAX)
+ else if (n->tn_bits <= TNODE_KMALLOC_MAX)
kfree(n);
else
vfree(n);
}
-#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
-static inline void free_leaf_info(struct leaf_info *leaf)
+static struct tnode *tnode_alloc(int bits)
{
- kfree_rcu(leaf, rcu);
-}
+ size_t size;
+
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
-static struct tnode *tnode_alloc(size_t size)
-{
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
}
-static inline void empty_child_inc(struct tnode *n)
+static inline void empty_child_inc(struct key_vector *n)
{
- ++n->empty_children ? : ++n->full_children;
+ ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
}
-static inline void empty_child_dec(struct tnode *n)
+static inline void empty_child_dec(struct key_vector *n)
{
- n->empty_children-- ? : n->full_children--;
+ tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
}
-static struct tnode *leaf_new(t_key key)
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
- if (l) {
- l->parent = NULL;
- /* set key and pos to reflect full key value
- * any trailing zeros in the key should be ignored
- * as the nodes are searched
- */
- l->key = key;
- l->slen = 0;
- l->pos = 0;
- /* set bits to 0 indicating we are not a tnode */
- l->bits = 0;
+ struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct key_vector *l = kv->kv;
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
-}
+ if (!kv)
+ return NULL;
-static struct leaf_info *leaf_info_new(int plen)
-{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- li->mask_plen = ntohl(inet_make_mask(plen));
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ /* initialize key vector */
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
+
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
+
+ return l;
}
-static struct tnode *tnode_new(t_key key, int pos, int bits)
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = offsetof(struct tnode, child[1ul << bits]);
- struct tnode *tn = tnode_alloc(sz);
+ struct tnode *tnode = tnode_alloc(bits);
unsigned int shift = pos + bits;
+ struct key_vector *tn = tnode->kv;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
- if (tn) {
- tn->parent = NULL;
- tn->slen = pos;
- tn->pos = pos;
- tn->bits = bits;
- tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
- if (bits == KEYLENGTH)
- tn->full_children = 1;
- else
- tn->empty_children = 1ul << bits;
- }
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
+ if (!tnode)
+ return NULL;
+
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
- pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct tnode *) << bits);
return tn;
}
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
@@ -381,17 +382,18 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
/* Add a child at position i overwriting the old value.
* Update the value of full_children and empty_children.
*/
-static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
{
- struct tnode *chi = tnode_get_child(tn, i);
+ struct key_vector *chi = get_child(tn, i);
int isfull, wasfull;
- BUG_ON(i >= tnode_child_length(tn));
+ BUG_ON(i >= child_length(tn));
/* update emptyChildren, overflow into fullChildren */
- if (n == NULL && chi != NULL)
+ if (!n && chi)
empty_child_inc(tn);
- if (n != NULL && chi == NULL)
+ if (n && !chi)
empty_child_dec(tn);
/* update fullChildren */
@@ -399,23 +401,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
- tn->full_children--;
+ tn_info(tn)->full_children--;
else if (!wasfull && isfull)
- tn->full_children++;
+ tn_info(tn)->full_children++;
if (n && (tn->slen < n->slen))
tn->slen = n->slen;
- rcu_assign_pointer(tn->child[i], n);
+ rcu_assign_pointer(tn->tnode[i], n);
}
-static void update_children(struct tnode *tn)
+static void update_children(struct key_vector *tn)
{
unsigned long i;
/* update all of the child parent pointers */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
if (!inode)
continue;
@@ -431,36 +433,37 @@ static void update_children(struct tnode *tn)
}
}
-static inline void put_child_root(struct tnode *tp, struct trie *t,
- t_key key, struct tnode *n)
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
{
- if (tp)
- put_child(tp, get_index(key, tp), n);
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
else
- rcu_assign_pointer(t->trie, n);
+ put_child(tp, get_index(key, tp), n);
}
-static inline void tnode_free_init(struct tnode *tn)
+static inline void tnode_free_init(struct key_vector *tn)
{
- tn->rcu.next = NULL;
+ tn_info(tn)->rcu.next = NULL;
}
-static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
{
- n->rcu.next = tn->rcu.next;
- tn->rcu.next = &n->rcu;
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}
-static void tnode_free(struct tnode *tn)
+static void tnode_free(struct key_vector *tn)
{
- struct callback_head *head = &tn->rcu;
+ struct callback_head *head = &tn_info(tn)->rcu;
while (head) {
head = head->next;
- tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
node_free(tn);
- tn = container_of(head, struct tnode, rcu);
+ tn = container_of(head, struct tnode, rcu)->kv;
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -469,14 +472,16 @@ static void tnode_free(struct tnode *tn)
}
}
-static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
{
- struct tnode *tp = node_parent(oldtnode);
+ struct key_vector *tp = node_parent(oldtnode);
unsigned long i;
/* setup the parent pointer out of and back into this node */
NODE_INIT_PARENT(tn, tp);
- put_child_root(tp, t, tn->key, tn);
+ put_child_root(tp, tn->key, tn);
/* update all of the child parent pointers */
update_children(tn);
@@ -485,18 +490,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
tnode_free(oldtnode);
/* resize children now that oldtnode is freed */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
/* resize child node */
if (tnode_full(tn, inode))
- resize(t, inode);
+ tn = resize(t, inode);
}
+
+ return tp;
}
-static int inflate(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
t_key m;
@@ -504,7 +512,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -514,13 +522,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
- struct tnode *inode = tnode_get_child(oldtnode, --i);
- struct tnode *node0, *node1;
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
unsigned long j, k;
/* An empty child */
- if (inode == NULL)
+ if (!inode)
continue;
/* A leaf or an internal node with skipped bits */
@@ -534,8 +542,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
/* An internal node with two children */
if (inode->bits == 1) {
- put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
- put_child(tn, 2 * i, tnode_get_child(inode, 0));
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
continue;
}
@@ -564,11 +572,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tnode_free_append(tn, node0);
/* populate child pointers in new nodes */
- for (k = tnode_child_length(inode), j = k / 2; j;) {
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
}
/* link new nodes to parent */
@@ -581,25 +589,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
- return -ENOMEM;
+notnode:
+ return NULL;
}
-static int halve(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -609,10 +617,10 @@ static int halve(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode); i;) {
- struct tnode *node1 = tnode_get_child(oldtnode, --i);
- struct tnode *node0 = tnode_get_child(oldtnode, --i);
- struct tnode *inode;
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
/* At least one of the children is empty */
if (!node1 || !node0) {
@@ -622,10 +630,8 @@ static int halve(struct trie *t, struct tnode *oldtnode)
/* Two nonempty children */
inode = tnode_new(node0->key, oldtnode->pos, 1);
- if (!inode) {
- tnode_free(tn);
- return -ENOMEM;
- }
+ if (!inode)
+ goto nomem;
tnode_free_append(tn, inode);
/* initialize pointers out of node */
@@ -638,30 +644,36 @@ static int halve(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
}
-static void collapse(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *n, *tp;
+ struct key_vector *n, *tp;
unsigned long i;
/* scan the tnode looking for that one child that might still exist */
- for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
- n = tnode_get_child(oldtnode, --i);
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
/* compress one level */
tp = node_parent(oldtnode);
- put_child_root(tp, t, oldtnode->key, n);
+ put_child_root(tp, oldtnode->key, n);
node_set_parent(n, tp);
/* drop dead node */
node_free(oldtnode);
+
+ return tp;
}
-static unsigned char update_suffix(struct tnode *tn)
+static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
@@ -671,8 +683,8 @@ static unsigned char update_suffix(struct tnode *tn)
* why we start with a stride of 2 since a stride of 1 would
* represent the nodes with suffix length equal to tn->pos
*/
- for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
- struct tnode *n = tnode_get_child(tn, i);
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
if (!n || (n->slen <= slen))
continue;
@@ -704,12 +716,12 @@ static unsigned char update_suffix(struct tnode *tn)
*
* 'high' in this instance is the variable 'inflate_threshold'. It
* is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
+ * child_length() and instead of multiplying by 2 (since the
* child array will be doubled by inflate()) and multiplying
* the left-hand side by 100 (to handle the percentage thing) we
* multiply the left-hand side by 50.
*
- * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * The left-hand side may look a bit weird: child_length(tn)
* - tn->empty_children is of course the number of non-null children
* in the current node. tn->full_children is the number of "full"
* children, that is non-null tnodes with a skip value of 0.
@@ -719,10 +731,10 @@ static unsigned char update_suffix(struct tnode *tn)
* A clearer way to write this would be:
*
* to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
* tn->full_children;
*
- * new_child_length = tnode_child_length(tn) * 2;
+ * new_child_length = child_length(tn) * 2;
*
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
* new_child_length;
@@ -739,57 +751,57 @@ static unsigned char update_suffix(struct tnode *tn)
* inflate_threshold * new_child_length
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
+ * inflate_threshold * child_length(tn) * 2
*
* shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
+ * 50 * (tn->full_children + child_length(tn) -
* tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
+ * child_length(tn)
*
*/
-static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? inflate_threshold : inflate_threshold_root;
- used -= tn->empty_children;
- used += tn->full_children;
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
/* if bits == KEYLENGTH then pos = 0, and will fail below */
return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
-static bool should_halve(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? halve_threshold : halve_threshold_root;
- used -= tn->empty_children;
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}
-static bool should_collapse(const struct tnode *tn)
+static inline bool should_collapse(struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
- used -= tn->empty_children;
+ used -= tn_info(tn)->empty_children;
/* account for bits == KEYLENGTH case */
- if ((tn->bits == KEYLENGTH) && tn->full_children)
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
used -= KEY_MAX;
/* One child or none, time to drop us from the trie */
@@ -797,10 +809,13 @@ static bool should_collapse(const struct tnode *tn)
}
#define MAX_WORK 10
-static void resize(struct trie *t, struct tnode *tn)
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp = node_parent(tn);
- struct tnode __rcu **cptr;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
int max_work = MAX_WORK;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
@@ -810,183 +825,128 @@ static void resize(struct trie *t, struct tnode *tn)
* doing it ourselves. This way we can let RCU fully do its
* thing without us interfering
*/
- cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
- BUG_ON(tn != rtnl_dereference(*cptr));
+ BUG_ON(tn != get_child(tp, cindex));
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
while (should_inflate(tp, tn) && max_work) {
- if (inflate(t, tn)) {
+ tp = inflate(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
+ /* update parent in case inflate failed */
+ tp = node_parent(tn);
+
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return;
+ return tp;
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
while (should_halve(tp, tn) && max_work) {
- if (halve(t, tn)) {
+ tp = halve(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
/* Only one child remains */
- if (should_collapse(tn)) {
- collapse(t, tn);
- return;
- }
+ if (should_collapse(tn))
+ return collapse(t, tn);
+
+ /* update parent in case halve failed */
+ tp = node_parent(tn);
/* Return if at least one deflate was run */
if (max_work != MAX_WORK)
- return;
+ return tp;
/* push the suffix length to the parent node */
if (tn->slen > tn->pos) {
unsigned char slen = update_suffix(tn);
- if (tp && (slen > tp->slen))
+ if (slen > tp->slen)
tp->slen = slen;
}
-}
-
-/* readside must use rcu_read_lock currently dump routines
- via get_fa_head and dump */
-
-static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
-{
- struct hlist_head *head = &l->list;
- struct leaf_info *li;
-
- hlist_for_each_entry_rcu(li, head, hlist)
- if (li->plen == plen)
- return li;
-
- return NULL;
-}
-
-static inline struct list_head *get_fa_head(struct tnode *l, int plen)
-{
- struct leaf_info *li = find_leaf_info(l, plen);
-
- if (!li)
- return NULL;
- return &li->falh;
+ return tp;
}
-static void leaf_pull_suffix(struct tnode *l)
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
{
- struct tnode *tp = node_parent(l);
-
- while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
if (update_suffix(tp) > l->slen)
break;
tp = node_parent(tp);
}
}
-static void leaf_push_suffix(struct tnode *l)
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
{
- struct tnode *tn = node_parent(l);
-
/* if this is a new leaf then tn will be NULL and we can sort
* out parent suffix lengths as a part of trie_rebalance
*/
- while (tn && (tn->slen < l->slen)) {
+ while (tn->slen < l->slen) {
tn->slen = l->slen;
tn = node_parent(tn);
}
}
-static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
-{
- /* record the location of the previous list_info entry */
- struct hlist_node **pprev = old->hlist.pprev;
- struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
-
- /* remove the leaf info from the list */
- hlist_del_rcu(&old->hlist);
-
- /* only access li if it is pointing at the last valid hlist_node */
- if (hlist_empty(&l->list) || (*pprev))
- return;
-
- /* update the trie with the latest suffix length */
- l->slen = KEYLENGTH - li->plen;
- leaf_pull_suffix(l);
-}
-
-static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
{
- struct hlist_head *head = &l->list;
- struct leaf_info *li = NULL, *last = NULL;
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&new->hlist, head);
- } else {
- hlist_for_each_entry(li, head, hlist) {
- if (new->plen > li->plen)
- break;
-
- last = li;
- }
- if (last)
- hlist_add_behind_rcu(&new->hlist, &last->hlist);
- else
- hlist_add_before_rcu(&new->hlist, &li->hlist);
- }
-
- /* if we added to the tail node then we need to update slen */
- if (l->slen < (KEYLENGTH - new->plen)) {
- l->slen = KEYLENGTH - new->plen;
- leaf_push_suffix(l);
- }
-}
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
-/* rcu_read_lock needs to be hold by caller from readside */
-static struct tnode *fib_find_node(struct trie *t, u32 key)
-{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ if (!n)
+ break;
- while (n) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
- return NULL;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n))
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
break;
+ }
- n = tnode_get_child_rcu(n, index);
- }
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
+
+ *tp = pn;
return n;
}
@@ -994,14 +954,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
*/
-static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ u8 tos, u32 prio, u32 tb_id)
{
struct fib_alias *fa;
if (!fah)
return NULL;
- list_for_each_entry(fa, fah, fa_list) {
+ hlist_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
+ break;
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1011,77 +980,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-static void trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp;
-
- while ((tp = node_parent(tn)) != NULL) {
- resize(t, tn);
- tn = tp;
- }
-
- /* Handle last (top) tnode */
- if (IS_TNODE(tn))
- resize(t, tn);
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
}
-/* only used from updater-side */
-
-static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
{
- struct list_head *fa_head = NULL;
- struct tnode *l, *n, *tp = NULL;
- struct leaf_info *li;
-
- li = leaf_info_new(plen);
- if (!li)
- return NULL;
- fa_head = &li->falh;
+ struct key_vector *n, *l;
- n = rtnl_dereference(t->trie);
-
- /* If we point to NULL, stop. Either the tree is empty and we should
- * just put a new leaf in if, or we have reached an empty child slot,
- * and we should just put our new leaf in that.
- *
- * If we hit a node with a key that does't match then we should stop
- * and create a new tnode to replace that node and insert ourselves
- * and the other node into the new tnode.
- */
- while (n) {
- unsigned long index = get_index(key, n);
-
- /* This bit of code is a bit tricky but it combines multiple
- * checks into a single check. The prefix consists of the
- * prefix plus zeros for the "bits" in the prefix. The index
- * is the difference between the key and this value. From
- * this we can actually derive several pieces of data.
- * if !(index >> bits)
- * we know the value is child index
- * else
- * we have a mismatch in skip bits and failed
- */
- if (index >> n->bits)
- break;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n)) {
- /* Case 1: n is a leaf, and prefixes match*/
- insert_leaf_info(n, li);
- return fa_head;
- }
-
- tp = n;
- n = tnode_get_child_rcu(n, index);
- }
-
- l = leaf_new(key);
- if (!l) {
- free_leaf_info(li);
- return NULL;
- }
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
- insert_leaf_info(l, li);
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
@@ -1090,21 +1005,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* leaves us in position for handling as case 3
*/
if (n) {
- struct tnode *tn;
+ struct key_vector *tn;
tn = tnode_new(key, __fls(key ^ n->key), 1);
- if (!tn) {
- free_leaf_info(li);
- node_free(l);
- return NULL;
- }
+ if (!tn)
+ goto notnode;
/* initialize routes out of node */
NODE_INIT_PARENT(tn, tp);
put_child(tn, get_index(key, tn) ^ 1, n);
/* start adding routes into the node */
- put_child_root(tp, t, key, tn);
+ put_child_root(tp, key, tn);
node_set_parent(n, tn);
/* parent now has a NULL spot where the leaf can go */
@@ -1112,69 +1024,93 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
}
/* Case 3: n is NULL, and will just insert a new leaf */
- if (tp) {
- NODE_INIT_PARENT(l, tp);
- put_child(tp, get_index(key, tp), l);
- trie_rebalance(t, tp);
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else {
- rcu_assign_pointer(t->trie, l);
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
+ }
+
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
}
- return fa_head;
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ leaf_push_suffix(tp, l);
+ }
+
+ return 0;
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
- struct list_head *fa_head = NULL;
+ struct key_vector *l, *tp;
struct fib_info *fi;
- int plen = cfg->fc_dst_len;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos;
- u32 key, mask;
+ u32 key;
int err;
- struct tnode *l;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl(inet_make_mask(plen));
-
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
-
fi = fib_create_info(cfg);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
- l = fib_find_node(t, key);
- fa = NULL;
-
- if (l) {
- fa_head = get_fa_head(l, plen);
- fa = fib_find_alias(fa_head, tos, fi->fib_priority);
- }
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ tb->tb_id) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
*/
if (fa && fa->fa_tos == tos &&
@@ -1192,9 +1128,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
*/
fa_match = NULL;
fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
- if (fa->fa_tos != tos)
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1217,7 +1154,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
+ if (!new_fa)
goto out;
fi_drop = fa->fa_info;
@@ -1226,8 +1163,21 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+
+ err = netdev_switch_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1254,37 +1204,42 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
+ if (!new_fa)
goto out;
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
- /*
- * Insert new entry to the list.
- */
-
- if (!fa_head) {
- fa_head = fib_insert_node(t, key, plen);
- if (unlikely(!fa_head)) {
- err = -ENOMEM;
- goto out_free_new_fa;
- }
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+
+ /* (Optionally) offload fib entry to switch hardware. */
+ err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ goto out_free_new_fa;
}
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_sw_fib_del;
+
if (!plen)
tb->tb_num_default++;
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
-
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
return 0;
+out_sw_fib_del:
+ netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1293,7 +1248,7 @@ err:
return err;
}
-static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
t_key prefix = n->key;
@@ -1304,16 +1259,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
- struct trie *t = (struct trie *)tb->tb_data;
+ struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
const t_key key = ntohl(flp->daddr);
- struct tnode *n, *pn;
- struct leaf_info *li;
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
t_key cindex;
- n = rcu_dereference(t->trie);
+ pn = t->kv;
+ cindex = 0;
+
+ n = get_child_rcu(pn, cindex);
if (!n)
return -EAGAIN;
@@ -1321,24 +1280,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
this_cpu_inc(stats->gets);
#endif
- pn = n;
- cindex = 0;
-
/* Step 1: Travel to the longest prefix match in the trie */
for (;;) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
+ if (index >= (1ul << n->bits))
break;
/* we have found a leaf. Prefixes have already been compared */
@@ -1353,7 +1313,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
cindex = index;
}
- n = tnode_get_child_rcu(n, index);
+ n = get_child_rcu(n, index);
if (unlikely(!n))
goto backtrace;
}
@@ -1361,7 +1321,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 2: Sort out leaves and begin backtracing for longest prefix */
for (;;) {
/* record the pointer where our next node pointer is stored */
- struct tnode __rcu **cptr = n->child;
+ struct key_vector __rcu **cptr = n->tnode;
/* This test verifies that none of the bits that differ
* between the key and the prefix exist in the region of
@@ -1393,13 +1353,17 @@ backtrace:
while (!cindex) {
t_key pkey = pn->key;
- pn = node_parent_rcu(pn);
- if (unlikely(!pn))
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn))
return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
/* Get Child's index */
+ pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn);
}
@@ -1407,138 +1371,134 @@ backtrace:
cindex &= cindex - 1;
/* grab pointer for next child node */
- cptr = &pn->child[cindex];
+ cptr = &pn->tnode[cindex];
}
}
found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
+
/* Step 3: Process the leaf, if that fails fall back to backtracing */
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
- if ((key ^ n->key) & li->mask_plen)
+ if ((index >= (1ul << fa->fa_slen)) &&
+ ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
continue;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- struct fib_info *fi = fa->fa_info;
- int nhsel, err;
-
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
- continue;
- if (fi->fib_dead)
- continue;
- if (fa->fa_info->fib_scope < flp->flowi4_scope)
- continue;
- fib_alias_accessed(fa);
- err = fib_props[fa->fa_type].error;
- if (unlikely(err < 0)) {
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- return err;
- }
- if (fi->fib_flags & RTNH_F_DEAD)
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
- continue;
-
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&fi->fib_clntref);
-
- res->prefixlen = li->plen;
- res->nh_sel = nhsel;
- res->type = fa->fa_type;
- res->scope = fi->fib_scope;
- res->fi = fi;
- res->table = tb;
- res->fa_head = &li->falh;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- return err;
- }
+ return err;
}
-
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_miss);
+ this_cpu_inc(stats->semantic_match_miss);
#endif
- }
goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
-/*
- * Remove the leaf and return parent.
- */
-static void trie_leaf_remove(struct trie *t, struct tnode *l)
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
{
- struct tnode *tp = node_parent(l);
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
- pr_debug("entering trie_leaf_remove(%p)\n", l);
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
- if (tp) {
- put_child(tp, get_index(l->key, tp), NULL);
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->leaf)) {
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
trie_rebalance(t, tp);
- } else {
- RCU_INIT_POINTER(t->trie, NULL);
+ return;
}
- node_free(l);
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
+
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ leaf_pull_suffix(tp, l);
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
- u32 key, mask;
- int plen = cfg->fc_dst_len;
- u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
- struct list_head *fa_head;
- struct tnode *l;
- struct leaf_info *li;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
- mask = ntohl(inet_make_mask(plen));
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
- l = fib_find_node(t, key);
-
+ l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
- li = find_leaf_info(l, plen);
-
- if (!li)
- return -ESRCH;
-
- fa_head = &li->falh;
- fa = fib_find_alias(fa_head, tos, 0);
-
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
if (!fa)
return -ESRCH;
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
+ hlist_for_each_entry_from(fa, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (fa->fa_tos != tos)
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -1557,240 +1517,397 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
- &cfg->fc_nlinfo, 0);
+ netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
- list_del_rcu(&fa->fa_list);
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
if (!plen)
tb->tb_num_default--;
- if (list_empty(fa_head)) {
- remove_leaf_info(l, li);
- free_leaf_info(li);
- }
+ fib_remove_alias(t, tp, l, fa_to_delete);
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, l);
-
- if (fa->fa_state & FA_S_ACCESSED)
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
return 0;
}
-static int trie_flush_list(struct list_head *head)
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
- struct fib_alias *fa, *fa_node;
- int found = 0;
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
- list_for_each_entry_safe(fa, fa_node, head, fa_list) {
- struct fib_info *fi = fa->fa_info;
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = key ? get_index(key, pn) : 0;
- if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
- found++;
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
}
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
}
- return found;
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
}
-static int trie_flush_leaf(struct tnode *l)
+static void fib_trie_free(struct fib_table *tb)
{
- int found = 0;
- struct hlist_head *lih = &l->list;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
struct hlist_node *tmp;
- struct leaf_info *li = NULL;
- unsigned char plen = KEYLENGTH;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
- hlist_for_each_entry_safe(li, tmp, lih, hlist) {
- found += trie_flush_list(&li->falh);
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
- if (list_empty(&li->falh)) {
- hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
continue;
}
- plen = li->plen;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- l->slen = KEYLENGTH - plen;
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- return found;
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ }
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
}
-/*
- * Scan for the next right leaf starting at node p->child[idx]
- * Since we have back pointer, no recursion necessary.
- */
-static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
- do {
- unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
- while (idx < tnode_child_length(p)) {
- c = tnode_get_child_rcu(p, idx++);
- if (!c)
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
continue;
- if (IS_LEAF(c))
- return c;
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ memcpy(new_fa, fa, sizeof(*fa));
- /* Rescan start scanning in new node */
- p = c;
- idx = 0;
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key))
+ goto out;
}
- /* Node empty, walk back up to parent */
- c = p;
- } while ((p = node_parent_rcu(c)) != NULL);
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
+ }
- return NULL; /* Root of trie */
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
+
+ return NULL;
}
-static struct tnode *trie_firstleaf(struct trie *t)
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
- if (!n)
- return NULL;
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- if (IS_LEAF(n)) /* trie is just a leaf */
- return n;
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- return leaf_walk_rcu(n, NULL);
-}
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
-static struct tnode *trie_nextleaf(struct tnode *l)
-{
- struct tnode *p = node_parent_rcu(l);
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
- if (!p)
- return NULL; /* trie with just one leaf */
+ continue;
+ }
- return leaf_walk_rcu(p, l);
-}
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
-static struct tnode *trie_leafindex(struct trie *t, int index)
-{
- struct tnode *l = trie_firstleaf(t);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- while (l && index-- > 0)
- l = trie_nextleaf(l);
+ continue;
+ }
- return l;
-}
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+ /* record local slen */
+ slen = fa->fa_slen;
-/*
- * Caller must hold RTNL.
- */
+ if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
+ continue;
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+}
+
+/* Caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
- struct trie *t = (struct trie *) tb->tb_data;
- struct tnode *l, *ll = NULL;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
int found = 0;
- for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
- found += trie_flush_leaf(l);
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
}
- ll = l;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
}
pr_debug("trie_flush found=%d\n", found);
return found;
}
-void fib_free_table(struct fib_table *tb)
+static void __trie_free_rcu(struct rcu_head *head)
{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie *t = (struct trie *)tb->tb_data;
- free_percpu(t->stats);
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
- struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+void fib_free_table(struct fib_table *tb)
{
- int i, s_i;
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ __be32 xkey = htonl(l->key);
struct fib_alias *fa;
- __be32 xkey = htonl(key);
+ int i, s_i;
- s_i = cb->args[5];
+ s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
-
- list_for_each_entry_rcu(fa, fah, fa_list) {
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (i < s_i) {
i++;
continue;
}
+ if (tb->tb_id != fa->tb_id) {
+ i++;
+ continue;
+ }
+
if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE,
tb->tb_id,
fa->fa_type,
xkey,
- plen,
+ KEYLENGTH - fa->fa_slen,
fa->fa_tos,
fa->fa_info, NLM_F_MULTI) < 0) {
- cb->args[5] = i;
- return -1;
- }
- i++;
- }
- cb->args[5] = i;
- return skb->len;
-}
-
-static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct leaf_info *li;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
-
- /* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- if (i < s_i) {
- i++;
- continue;
- }
-
- if (i > s_i)
- cb->args[5] = 0;
-
- if (list_empty(&li->falh))
- continue;
-
- if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
cb->args[4] = i;
return -1;
}
@@ -1801,44 +1918,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
return skb->len;
}
+/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct tnode *l;
- struct trie *t = (struct trie *) tb->tb_data;
- t_key key = cb->args[2];
- int count = cb->args[3];
-
- rcu_read_lock();
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
- if (count == 0)
- l = trie_firstleaf(t);
- else {
- /* Normally, continue from last key, but if that is missing
- * fallback to using slow rescan
- */
- l = fib_find_node(t, key);
- if (!l)
- l = trie_leafindex(t, count);
- }
+ int count = cb->args[2];
+ t_key key = cb->args[3];
- while (l) {
- cb->args[2] = l->key;
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
- cb->args[3] = count;
- rcu_read_unlock();
+ cb->args[3] = key;
+ cb->args[2] = count;
return -1;
}
++count;
- l = trie_nextleaf(l);
+ key = l->key + 1;
+
memset(&cb->args[4], 0,
sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
}
- cb->args[3] = count;
- rcu_read_unlock();
+
+ cb->args[3] = key;
+ cb->args[2] = count;
return skb->len;
}
@@ -1850,28 +1961,34 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
- max(sizeof(struct tnode),
- sizeof(struct leaf_info)),
+ LEAF_SIZE,
0, SLAB_PANIC, NULL);
}
-
-struct fib_table *fib_trie_table(u32 id)
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
struct trie *t;
+ size_t sz = sizeof(*tb);
+
+ if (!alias)
+ sz += sizeof(struct trie);
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
- GFP_KERNEL);
- if (tb == NULL)
+ tb = kzalloc(sz, GFP_KERNEL);
+ if (!tb)
return NULL;
tb->tb_id = id;
tb->tb_default = -1;
tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
+
+ if (alias)
+ return tb;
t = (struct trie *) tb->tb_data;
- RCU_INIT_POINTER(t->trie, NULL);
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats = alloc_percpu(struct trie_use_stats);
if (!t->stats) {
@@ -1888,65 +2005,63 @@ struct fib_table *fib_trie_table(u32 id)
struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
- struct tnode *tnode;
+ struct key_vector *tnode;
unsigned int index;
unsigned int depth;
};
-static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
unsigned long cindex = iter->index;
- struct tnode *tn = iter->tnode;
- struct tnode *p;
-
- /* A single entry routing table */
- if (!tn)
- return NULL;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
-rescan:
- while (cindex < tnode_child_length(tn)) {
- struct tnode *n = tnode_get_child_rcu(tn, cindex);
- if (n) {
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
if (IS_LEAF(n)) {
- iter->tnode = tn;
- iter->index = cindex + 1;
+ iter->tnode = pn;
+ iter->index = cindex;
} else {
/* push down one level */
iter->tnode = n;
iter->index = 0;
++iter->depth;
}
+
return n;
}
- ++cindex;
- }
-
- /* Current node exhausted, pop back up */
- p = node_parent_rcu(tn);
- if (p) {
- cindex = get_index(tn->key, p) + 1;
- tn = p;
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
--iter->depth;
- goto rescan;
}
- /* got root? */
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
return NULL;
}
-static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
- struct trie *t)
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- struct tnode *n;
+ struct key_vector *n, *pn = t->kv;
if (!t)
return NULL;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
@@ -1955,7 +2070,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
iter->index = 0;
iter->depth = 1;
} else {
- iter->tnode = NULL;
+ iter->tnode = pn;
iter->index = 0;
iter->depth = 0;
}
@@ -1965,7 +2080,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct tnode *n;
+ struct key_vector *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -1973,20 +2088,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
- struct leaf_info *li;
+ struct fib_alias *fa;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, &n->list, hlist)
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
++s->prefixes;
} else {
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
- s->nullpointers += n->empty_children;
+ s->nullpointers += tn_info(n)->empty_children;
}
}
rcu_read_unlock();
@@ -2009,13 +2124,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- bytes = sizeof(struct tnode) * stat->leaves;
+ bytes = LEAF_SIZE * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
- bytes += sizeof(struct leaf_info) * stat->prefixes;
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
+ bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0)
@@ -2030,7 +2145,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct tnode *) * pointers;
+ bytes += sizeof(struct key_vector *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2084,7 +2199,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct tnode), sizeof(struct tnode));
+ LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2123,7 +2238,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2135,7 +2250,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- struct tnode *n;
+ struct key_vector *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2164,7 +2279,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct tnode *n;
+ struct key_vector *n;
++*pos;
/* next node in same table */
@@ -2250,9 +2365,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct tnode *n = v;
+ struct key_vector *n = v;
- if (!node_parent_rcu(n))
+ if (IS_TRIE(node_parent_rcu(n)))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
@@ -2261,30 +2376,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
&prf, KEYLENGTH - n->pos - n->bits, n->bits,
- n->full_children, n->empty_children);
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
} else {
- struct leaf_info *li;
__be32 val = htonl(n->key);
+ struct fib_alias *fa;
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- char buf1[32], buf2[32];
-
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", li->plen,
- rtn_scope(buf1, sizeof(buf1),
- fa->fa_info->fib_scope),
- rtn_type(buf2, sizeof(buf2),
- fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
- seq_putc(seq, '\n');
- }
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
@@ -2314,31 +2427,47 @@ static const struct file_operations fib_trie_fops = {
struct fib_route_iter {
struct seq_net_private p;
- struct trie *main_trie;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
loff_t pos;
t_key key;
};
-static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
{
- struct tnode *l = NULL;
- struct trie *t = iter->main_trie;
+ struct fib_table *tb = iter->main_tb;
+ struct key_vector *l, **tp = &iter->tnode;
+ struct trie *t;
+ t_key key;
- /* use cache location of last found key */
- if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+ /* use cache location of next-to-find key */
+ if (iter->pos > 0 && pos >= iter->pos) {
pos -= iter->pos;
- else {
+ key = iter->key;
+ } else {
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
iter->pos = 0;
- l = trie_firstleaf(t);
+ key = 0;
}
- while (l && pos-- > 0) {
+ while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+
+ if (pos-- <= 0)
+ break;
+
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
}
if (l)
- iter->key = pos; /* remember it */
+ iter->key = key; /* remember it */
else
iter->pos = 0; /* forget it */
@@ -2350,37 +2479,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct fib_table *tb;
+ struct trie *t;
rcu_read_lock();
+
tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
if (!tb)
return NULL;
- iter->main_trie = (struct trie *) tb->tb_data;
- if (*pos == 0)
- return SEQ_START_TOKEN;
- else
- return fib_route_get_idx(iter, *pos - 1);
+ iter->main_tb = tb;
+
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ iter->key = 0;
+
+ return SEQ_START_TOKEN;
}
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
- struct tnode *l = v;
+ struct key_vector *l = NULL;
+ t_key key = iter->key;
++*pos;
- if (v == SEQ_START_TOKEN) {
- iter->pos = 0;
- l = trie_firstleaf(iter->main_trie);
- } else {
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+ } else {
+ iter->pos = 0;
}
- if (l)
- iter->key = l->key;
- else
- iter->pos = 0;
return l;
}
@@ -2412,8 +2550,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- struct tnode *l = v;
- struct leaf_info *li;
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2422,45 +2563,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- struct fib_alias *fa;
- __be32 mask, prefix;
+ prefix = htonl(l->key);
- mask = inet_make_mask(li->plen);
- prefix = htonl(l->key);
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- const struct fib_info *fi = fa->fa_info;
- unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
+ continue;
- if (fa->fa_type == RTN_BROADCAST
- || fa->fa_type == RTN_MULTICAST)
- continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
- seq_setwidth(seq, 127);
-
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->nh_gw, flags, 0, 0,
- fi->fib_priority,
- mask,
- (fi->fib_advmss ?
- fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0);
-
- seq_pad(seq, '\n');
- }
+ seq_setwidth(seq, 127);
+
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_pad(seq, '\n');
}
return 0;