mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-25 05:47:00 +00:00
238 lines
8.4 KiB
Diff
238 lines
8.4 KiB
Diff
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||
|
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
|
||
|
Date: Fri, 4 Jun 2021 17:17:36 +0200
|
||
|
Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)
|
||
|
|
||
|
commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.
|
||
|
|
||
|
Previously, deleting peers would require traversing the entire trie in
|
||
|
order to rebalance nodes and safely free them. This meant that removing
|
||
|
1000 peers from a trie with a half million nodes would take an extremely
|
||
|
long time, during which we're holding the rtnl lock. Large-scale users
|
||
|
were reporting 200ms latencies added to the networking stack as a whole
|
||
|
every time their userspace software would queue up significant removals.
|
||
|
That's a serious situation.
|
||
|
|
||
|
This commit fixes that by maintaining a double pointer to the parent's
|
||
|
bit pointer for each node, and then using the already existing node list
|
||
|
belonging to each peer to go directly to the node, fix up its pointers,
|
||
|
and free it with RCU. This means removal is O(1) instead of O(n), and we
|
||
|
don't use gobs of stack.
|
||
|
|
||
|
The removal algorithm has the same downside as the code that it fixes:
|
||
|
it won't collapse needlessly long runs of fillers. We can enhance that
|
||
|
in the future if it ever becomes a problem. This commit documents that
|
||
|
limitation with a TODO comment in code, a small but meaningful
|
||
|
improvement over the prior situation.
|
||
|
|
||
|
Currently the biggest flaw, which the next commit addresses, is that
|
||
|
because this increases the node size on 64-bit machines from 60 bytes to
|
||
|
68 bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
|
||
|
using twice as much memory per node, because of power-of-two
|
||
|
allocations, which is a big bummer. We'll need to figure something out
|
||
|
there.
|
||
|
|
||
|
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
|
||
|
Cc: stable@vger.kernel.org
|
||
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
||
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
||
|
---
|
||
|
drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
|
||
|
drivers/net/wireguard/allowedips.h | 9 +-
|
||
|
2 files changed, 57 insertions(+), 84 deletions(-)
|
||
|
|
||
|
--- a/drivers/net/wireguard/allowedips.c
|
||
|
+++ b/drivers/net/wireguard/allowedips.c
|
||
|
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
|
||
|
}
|
||
|
}
|
||
|
|
||
|
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
|
||
|
- struct wg_peer *peer, struct mutex *lock)
|
||
|
-{
|
||
|
-#define REF(p) rcu_access_pointer(p)
|
||
|
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
|
||
|
-#define PUSH(p) ({ \
|
||
|
- WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
|
||
|
- stack[len++] = p; \
|
||
|
- })
|
||
|
-
|
||
|
- struct allowedips_node __rcu **stack[128], **nptr;
|
||
|
- struct allowedips_node *node, *prev;
|
||
|
- unsigned int len;
|
||
|
-
|
||
|
- if (unlikely(!peer || !REF(*top)))
|
||
|
- return;
|
||
|
-
|
||
|
- for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
|
||
|
- nptr = stack[len - 1];
|
||
|
- node = DEREF(nptr);
|
||
|
- if (!node) {
|
||
|
- --len;
|
||
|
- continue;
|
||
|
- }
|
||
|
- if (!prev || REF(prev->bit[0]) == node ||
|
||
|
- REF(prev->bit[1]) == node) {
|
||
|
- if (REF(node->bit[0]))
|
||
|
- PUSH(&node->bit[0]);
|
||
|
- else if (REF(node->bit[1]))
|
||
|
- PUSH(&node->bit[1]);
|
||
|
- } else if (REF(node->bit[0]) == prev) {
|
||
|
- if (REF(node->bit[1]))
|
||
|
- PUSH(&node->bit[1]);
|
||
|
- } else {
|
||
|
- if (rcu_dereference_protected(node->peer,
|
||
|
- lockdep_is_held(lock)) == peer) {
|
||
|
- RCU_INIT_POINTER(node->peer, NULL);
|
||
|
- list_del_init(&node->peer_list);
|
||
|
- if (!node->bit[0] || !node->bit[1]) {
|
||
|
- rcu_assign_pointer(*nptr, DEREF(
|
||
|
- &node->bit[!REF(node->bit[0])]));
|
||
|
- kfree_rcu(node, rcu);
|
||
|
- node = DEREF(nptr);
|
||
|
- }
|
||
|
- }
|
||
|
- --len;
|
||
|
- }
|
||
|
- }
|
||
|
-
|
||
|
-#undef REF
|
||
|
-#undef DEREF
|
||
|
-#undef PUSH
|
||
|
-}
|
||
|
-
|
||
|
static unsigned int fls128(u64 a, u64 b)
|
||
|
{
|
||
|
return a ? fls64(a) + 64U : fls64(b);
|
||
|
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
|
||
|
RCU_INIT_POINTER(node->peer, peer);
|
||
|
list_add_tail(&node->peer_list, &peer->allowedips_list);
|
||
|
copy_and_assign_cidr(node, key, cidr, bits);
|
||
|
+ rcu_assign_pointer(node->parent_bit, trie);
|
||
|
rcu_assign_pointer(*trie, node);
|
||
|
return 0;
|
||
|
}
|
||
|
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
|
||
|
if (!node) {
|
||
|
down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
|
||
|
} else {
|
||
|
- down = rcu_dereference_protected(CHOOSE_NODE(node, key),
|
||
|
- lockdep_is_held(lock));
|
||
|
+ down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
|
||
|
if (!down) {
|
||
|
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
|
||
|
rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
|
||
|
return 0;
|
||
|
}
|
||
|
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
|
||
|
parent = node;
|
||
|
|
||
|
if (newnode->cidr == cidr) {
|
||
|
+ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
|
||
|
rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
|
||
|
- if (!parent)
|
||
|
+ if (!parent) {
|
||
|
+ rcu_assign_pointer(newnode->parent_bit, trie);
|
||
|
rcu_assign_pointer(*trie, newnode);
|
||
|
- else
|
||
|
- rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
|
||
|
- newnode);
|
||
|
- } else {
|
||
|
- node = kzalloc(sizeof(*node), GFP_KERNEL);
|
||
|
- if (unlikely(!node)) {
|
||
|
- list_del(&newnode->peer_list);
|
||
|
- kfree(newnode);
|
||
|
- return -ENOMEM;
|
||
|
+ } else {
|
||
|
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
|
||
|
+ rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
|
||
|
}
|
||
|
- INIT_LIST_HEAD(&node->peer_list);
|
||
|
- copy_and_assign_cidr(node, newnode->bits, cidr, bits);
|
||
|
+ return 0;
|
||
|
+ }
|
||
|
+
|
||
|
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
|
||
|
+ if (unlikely(!node)) {
|
||
|
+ list_del(&newnode->peer_list);
|
||
|
+ kfree(newnode);
|
||
|
+ return -ENOMEM;
|
||
|
+ }
|
||
|
+ INIT_LIST_HEAD(&node->peer_list);
|
||
|
+ copy_and_assign_cidr(node, newnode->bits, cidr, bits);
|
||
|
|
||
|
- rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
|
||
|
- rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
|
||
|
- if (!parent)
|
||
|
- rcu_assign_pointer(*trie, node);
|
||
|
- else
|
||
|
- rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
|
||
|
- node);
|
||
|
+ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
|
||
|
+ rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
|
||
|
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
|
||
|
+ rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
|
||
|
+ if (!parent) {
|
||
|
+ rcu_assign_pointer(node->parent_bit, trie);
|
||
|
+ rcu_assign_pointer(*trie, node);
|
||
|
+ } else {
|
||
|
+ rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
|
||
|
+ rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
|
||
|
void wg_allowedips_remove_by_peer(struct allowedips *table,
|
||
|
struct wg_peer *peer, struct mutex *lock)
|
||
|
{
|
||
|
+ struct allowedips_node *node, *child, *tmp;
|
||
|
+
|
||
|
+ if (list_empty(&peer->allowedips_list))
|
||
|
+ return;
|
||
|
++table->seq;
|
||
|
- walk_remove_by_peer(&table->root4, peer, lock);
|
||
|
- walk_remove_by_peer(&table->root6, peer, lock);
|
||
|
+ list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
|
||
|
+ list_del_init(&node->peer_list);
|
||
|
+ RCU_INIT_POINTER(node->peer, NULL);
|
||
|
+ if (node->bit[0] && node->bit[1])
|
||
|
+ continue;
|
||
|
+ child = rcu_dereference_protected(
|
||
|
+ node->bit[!rcu_access_pointer(node->bit[0])],
|
||
|
+ lockdep_is_held(lock));
|
||
|
+ if (child)
|
||
|
+ child->parent_bit = node->parent_bit;
|
||
|
+ *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
|
||
|
+ kfree_rcu(node, rcu);
|
||
|
+
|
||
|
+ /* TODO: Note that we currently don't walk up and down in order to
|
||
|
+ * free any potential filler nodes. This means that this function
|
||
|
+ * doesn't free up as much as it could, which could be revisited
|
||
|
+ * at some point.
|
||
|
+ */
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
|
||
|
--- a/drivers/net/wireguard/allowedips.h
|
||
|
+++ b/drivers/net/wireguard/allowedips.h
|
||
|
@@ -15,14 +15,11 @@ struct wg_peer;
|
||
|
struct allowedips_node {
|
||
|
struct wg_peer __rcu *peer;
|
||
|
struct allowedips_node __rcu *bit[2];
|
||
|
- /* While it may seem scandalous that we waste space for v4,
|
||
|
- * we're alloc'ing to the nearest power of 2 anyway, so this
|
||
|
- * doesn't actually make a difference.
|
||
|
- */
|
||
|
- u8 bits[16] __aligned(__alignof(u64));
|
||
|
u8 cidr, bit_at_a, bit_at_b, bitlen;
|
||
|
+ u8 bits[16] __aligned(__alignof(u64));
|
||
|
|
||
|
- /* Keep rarely used list at bottom to be beyond cache line. */
|
||
|
+ /* Keep rarely used members at bottom to be beyond cache line. */
|
||
|
+ struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
|
||
|
union {
|
||
|
struct list_head peer_list;
|
||
|
struct rcu_head rcu;
|