mirror of
https://github.com/openwrt/openwrt.git
synced 2025-02-22 18:02:46 +00:00
kernel: replace threaded NAPI implementation in 5.10 with upstream backport
This uses a kthread per NAPI instance instead of the workqueue approach. Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: maurerr <mariusd84@gmail.com>
This commit is contained in:
parent
b4455a2d2e
commit
b2a02d4385
@ -0,0 +1,88 @@
|
|||||||
|
From: Felix Fietkau <nbd@nbd.name>
|
||||||
|
Date: Mon, 8 Feb 2021 11:34:08 -0800
|
||||||
|
Subject: [PATCH] net: extract napi poll functionality to __napi_poll()
|
||||||
|
|
||||||
|
This commit introduces a new function __napi_poll() which does the main
|
||||||
|
logic of the existing napi_poll() function, and will be called by other
|
||||||
|
functions in later commits.
|
||||||
|
This idea and implementation is done by Felix Fietkau <nbd@nbd.name> and
|
||||||
|
is proposed as part of the patch to move napi work to work_queue
|
||||||
|
context.
|
||||||
|
This commit by itself is a code restructure.
|
||||||
|
|
||||||
|
Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
||||||
|
Signed-off-by: Wei Wang <weiwan@google.com>
|
||||||
|
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
|
||||||
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||||
|
---
|
||||||
|
|
||||||
|
--- a/net/core/dev.c
|
||||||
|
+++ b/net/core/dev.c
|
||||||
|
@@ -6740,15 +6740,10 @@ void __netif_napi_del(struct napi_struct
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(__netif_napi_del);
|
||||||
|
|
||||||
|
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
|
||||||
|
+static int __napi_poll(struct napi_struct *n, bool *repoll)
|
||||||
|
{
|
||||||
|
- void *have;
|
||||||
|
int work, weight;
|
||||||
|
|
||||||
|
- list_del_init(&n->poll_list);
|
||||||
|
-
|
||||||
|
- have = netpoll_poll_lock(n);
|
||||||
|
-
|
||||||
|
weight = n->weight;
|
||||||
|
|
||||||
|
/* This NAPI_STATE_SCHED test is for avoiding a race
|
||||||
|
@@ -6768,7 +6763,7 @@ static int napi_poll(struct napi_struct
|
||||||
|
n->poll, work, weight);
|
||||||
|
|
||||||
|
if (likely(work < weight))
|
||||||
|
- goto out_unlock;
|
||||||
|
+ return work;
|
||||||
|
|
||||||
|
/* Drivers must not modify the NAPI state if they
|
||||||
|
* consume the entire weight. In such cases this code
|
||||||
|
@@ -6777,7 +6772,7 @@ static int napi_poll(struct napi_struct
|
||||||
|
*/
|
||||||
|
if (unlikely(napi_disable_pending(n))) {
|
||||||
|
napi_complete(n);
|
||||||
|
- goto out_unlock;
|
||||||
|
+ return work;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n->gro_bitmask) {
|
||||||
|
@@ -6795,12 +6790,29 @@ static int napi_poll(struct napi_struct
|
||||||
|
if (unlikely(!list_empty(&n->poll_list))) {
|
||||||
|
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
|
||||||
|
n->dev ? n->dev->name : "backlog");
|
||||||
|
- goto out_unlock;
|
||||||
|
+ return work;
|
||||||
|
}
|
||||||
|
|
||||||
|
- list_add_tail(&n->poll_list, repoll);
|
||||||
|
+ *repoll = true;
|
||||||
|
+
|
||||||
|
+ return work;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
|
||||||
|
+{
|
||||||
|
+ bool do_repoll = false;
|
||||||
|
+ void *have;
|
||||||
|
+ int work;
|
||||||
|
+
|
||||||
|
+ list_del_init(&n->poll_list);
|
||||||
|
+
|
||||||
|
+ have = netpoll_poll_lock(n);
|
||||||
|
+
|
||||||
|
+ work = __napi_poll(n, &do_repoll);
|
||||||
|
+
|
||||||
|
+ if (do_repoll)
|
||||||
|
+ list_add_tail(&n->poll_list, repoll);
|
||||||
|
|
||||||
|
-out_unlock:
|
||||||
|
netpoll_poll_unlock(have);
|
||||||
|
|
||||||
|
return work;
|
@ -0,0 +1,261 @@
|
|||||||
|
From: Wei Wang <weiwan@google.com>
|
||||||
|
Date: Mon, 8 Feb 2021 11:34:09 -0800
|
||||||
|
Subject: [PATCH] net: implement threaded-able napi poll loop support
|
||||||
|
|
||||||
|
This patch allows running each napi poll loop inside its own
|
||||||
|
kernel thread.
|
||||||
|
The kthread is created during netif_napi_add() if dev->threaded
|
||||||
|
is set. And threaded mode is enabled in napi_enable(). We will
|
||||||
|
provide a way to set dev->threaded and enable threaded mode
|
||||||
|
without a device up/down in the following patch.
|
||||||
|
|
||||||
|
Once threaded mode is enabled and the kthread is
|
||||||
|
started, napi_schedule() will wake-up such thread instead
|
||||||
|
of scheduling the softirq.
|
||||||
|
|
||||||
|
The threaded poll loop behaves much like net_rx_action(),
|
||||||
|
but it does not have to manipulate local irqs and uses
|
||||||
|
an explicit scheduling point based on netdev_budget.
|
||||||
|
|
||||||
|
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
|
||||||
|
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
||||||
|
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
|
||||||
|
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
|
||||||
|
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
|
||||||
|
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||||
|
Signed-off-by: Wei Wang <weiwan@google.com>
|
||||||
|
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
|
||||||
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||||
|
---
|
||||||
|
|
||||||
|
--- a/include/linux/netdevice.h
|
||||||
|
+++ b/include/linux/netdevice.h
|
||||||
|
@@ -347,6 +347,7 @@ struct napi_struct {
|
||||||
|
struct list_head dev_list;
|
||||||
|
struct hlist_node napi_hash_node;
|
||||||
|
unsigned int napi_id;
|
||||||
|
+ struct task_struct *thread;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
@@ -357,6 +358,7 @@ enum {
|
||||||
|
NAPI_STATE_LISTED, /* NAPI added to system lists */
|
||||||
|
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
|
||||||
|
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
|
||||||
|
+ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
|
||||||
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
@@ -367,6 +369,7 @@ enum {
|
||||||
|
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
|
||||||
|
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
|
||||||
|
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
|
||||||
|
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
|
||||||
|
};
|
||||||
|
|
||||||
|
enum gro_result {
|
||||||
|
@@ -497,20 +500,7 @@ static inline bool napi_complete(struct
|
||||||
|
*/
|
||||||
|
void napi_disable(struct napi_struct *n);
|
||||||
|
|
||||||
|
-/**
|
||||||
|
- * napi_enable - enable NAPI scheduling
|
||||||
|
- * @n: NAPI context
|
||||||
|
- *
|
||||||
|
- * Resume NAPI from being scheduled on this context.
|
||||||
|
- * Must be paired with napi_disable.
|
||||||
|
- */
|
||||||
|
-static inline void napi_enable(struct napi_struct *n)
|
||||||
|
-{
|
||||||
|
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
|
||||||
|
- smp_mb__before_atomic();
|
||||||
|
- clear_bit(NAPI_STATE_SCHED, &n->state);
|
||||||
|
- clear_bit(NAPI_STATE_NPSVC, &n->state);
|
||||||
|
-}
|
||||||
|
+void napi_enable(struct napi_struct *n);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* napi_synchronize - wait until NAPI is not running
|
||||||
|
@@ -1835,6 +1825,8 @@ enum netdev_priv_flags {
|
||||||
|
*
|
||||||
|
* @wol_enabled: Wake-on-LAN is enabled
|
||||||
|
*
|
||||||
|
+ * @threaded: napi threaded mode is enabled
|
||||||
|
+ *
|
||||||
|
* @net_notifier_list: List of per-net netdev notifier block
|
||||||
|
* that follow this device when it is moved
|
||||||
|
* to another network namespace.
|
||||||
|
@@ -2152,6 +2144,7 @@ struct net_device {
|
||||||
|
struct lock_class_key *qdisc_running_key;
|
||||||
|
bool proto_down;
|
||||||
|
unsigned wol_enabled:1;
|
||||||
|
+ unsigned threaded:1;
|
||||||
|
|
||||||
|
struct list_head net_notifier_list;
|
||||||
|
|
||||||
|
--- a/net/core/dev.c
|
||||||
|
+++ b/net/core/dev.c
|
||||||
|
@@ -91,6 +91,7 @@
|
||||||
|
#include <linux/etherdevice.h>
|
||||||
|
#include <linux/ethtool.h>
|
||||||
|
#include <linux/skbuff.h>
|
||||||
|
+#include <linux/kthread.h>
|
||||||
|
#include <linux/bpf.h>
|
||||||
|
#include <linux/bpf_trace.h>
|
||||||
|
#include <net/net_namespace.h>
|
||||||
|
@@ -1488,6 +1489,27 @@ void netdev_notify_peers(struct net_devi
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(netdev_notify_peers);
|
||||||
|
|
||||||
|
+static int napi_threaded_poll(void *data);
|
||||||
|
+
|
||||||
|
+static int napi_kthread_create(struct napi_struct *n)
|
||||||
|
+{
|
||||||
|
+ int err = 0;
|
||||||
|
+
|
||||||
|
+ /* Create and wake up the kthread once to put it in
|
||||||
|
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
|
||||||
|
+ * warning and work with loadavg.
|
||||||
|
+ */
|
||||||
|
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
|
||||||
|
+ n->dev->name, n->napi_id);
|
||||||
|
+ if (IS_ERR(n->thread)) {
|
||||||
|
+ err = PTR_ERR(n->thread);
|
||||||
|
+ pr_err("kthread_run failed with err %d\n", err);
|
||||||
|
+ n->thread = NULL;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return err;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
|
||||||
|
{
|
||||||
|
const struct net_device_ops *ops = dev->netdev_ops;
|
||||||
|
@@ -4242,6 +4264,21 @@ int gro_normal_batch __read_mostly = 8;
|
||||||
|
static inline void ____napi_schedule(struct softnet_data *sd,
|
||||||
|
struct napi_struct *napi)
|
||||||
|
{
|
||||||
|
+ struct task_struct *thread;
|
||||||
|
+
|
||||||
|
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
|
||||||
|
+ /* Paired with smp_mb__before_atomic() in
|
||||||
|
+ * napi_enable(). Use READ_ONCE() to guarantee
|
||||||
|
+ * a complete read on napi->thread. Only call
|
||||||
|
+ * wake_up_process() when it's not NULL.
|
||||||
|
+ */
|
||||||
|
+ thread = READ_ONCE(napi->thread);
|
||||||
|
+ if (thread) {
|
||||||
|
+ wake_up_process(thread);
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
list_add_tail(&napi->poll_list, &sd->poll_list);
|
||||||
|
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
|
||||||
|
}
|
||||||
|
@@ -6693,6 +6730,12 @@ void netif_napi_add(struct net_device *d
|
||||||
|
set_bit(NAPI_STATE_NPSVC, &napi->state);
|
||||||
|
list_add_rcu(&napi->dev_list, &dev->napi_list);
|
||||||
|
napi_hash_add(napi);
|
||||||
|
+ /* Create kthread for this napi if dev->threaded is set.
|
||||||
|
+ * Clear dev->threaded if kthread creation failed so that
|
||||||
|
+ * threaded mode will not be enabled in napi_enable().
|
||||||
|
+ */
|
||||||
|
+ if (dev->threaded && napi_kthread_create(napi))
|
||||||
|
+ dev->threaded = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(netif_napi_add);
|
||||||
|
|
||||||
|
@@ -6709,9 +6752,28 @@ void napi_disable(struct napi_struct *n)
|
||||||
|
hrtimer_cancel(&n->timer);
|
||||||
|
|
||||||
|
clear_bit(NAPI_STATE_DISABLE, &n->state);
|
||||||
|
+ clear_bit(NAPI_STATE_THREADED, &n->state);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(napi_disable);
|
||||||
|
|
||||||
|
+/**
|
||||||
|
+ * napi_enable - enable NAPI scheduling
|
||||||
|
+ * @n: NAPI context
|
||||||
|
+ *
|
||||||
|
+ * Resume NAPI from being scheduled on this context.
|
||||||
|
+ * Must be paired with napi_disable.
|
||||||
|
+ */
|
||||||
|
+void napi_enable(struct napi_struct *n)
|
||||||
|
+{
|
||||||
|
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
|
||||||
|
+ smp_mb__before_atomic();
|
||||||
|
+ clear_bit(NAPI_STATE_SCHED, &n->state);
|
||||||
|
+ clear_bit(NAPI_STATE_NPSVC, &n->state);
|
||||||
|
+ if (n->dev->threaded && n->thread)
|
||||||
|
+ set_bit(NAPI_STATE_THREADED, &n->state);
|
||||||
|
+}
|
||||||
|
+EXPORT_SYMBOL(napi_enable);
|
||||||
|
+
|
||||||
|
static void flush_gro_hash(struct napi_struct *napi)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
@@ -6737,6 +6799,11 @@ void __netif_napi_del(struct napi_struct
|
||||||
|
|
||||||
|
flush_gro_hash(napi);
|
||||||
|
napi->gro_bitmask = 0;
|
||||||
|
+
|
||||||
|
+ if (napi->thread) {
|
||||||
|
+ kthread_stop(napi->thread);
|
||||||
|
+ napi->thread = NULL;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(__netif_napi_del);
|
||||||
|
|
||||||
|
@@ -6818,6 +6885,51 @@ static int napi_poll(struct napi_struct
|
||||||
|
return work;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static int napi_thread_wait(struct napi_struct *napi)
|
||||||
|
+{
|
||||||
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
+
|
||||||
|
+ while (!kthread_should_stop() && !napi_disable_pending(napi)) {
|
||||||
|
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
|
||||||
|
+ WARN_ON(!list_empty(&napi->poll_list));
|
||||||
|
+ __set_current_state(TASK_RUNNING);
|
||||||
|
+ return 0;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ schedule();
|
||||||
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
+ }
|
||||||
|
+ __set_current_state(TASK_RUNNING);
|
||||||
|
+ return -1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static int napi_threaded_poll(void *data)
|
||||||
|
+{
|
||||||
|
+ struct napi_struct *napi = data;
|
||||||
|
+ void *have;
|
||||||
|
+
|
||||||
|
+ while (!napi_thread_wait(napi)) {
|
||||||
|
+ for (;;) {
|
||||||
|
+ bool repoll = false;
|
||||||
|
+
|
||||||
|
+ local_bh_disable();
|
||||||
|
+
|
||||||
|
+ have = netpoll_poll_lock(napi);
|
||||||
|
+ __napi_poll(napi, &repoll);
|
||||||
|
+ netpoll_poll_unlock(have);
|
||||||
|
+
|
||||||
|
+ __kfree_skb_flush();
|
||||||
|
+ local_bh_enable();
|
||||||
|
+
|
||||||
|
+ if (!repoll)
|
||||||
|
+ break;
|
||||||
|
+
|
||||||
|
+ cond_resched();
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static __latent_entropy void net_rx_action(struct softirq_action *h)
|
||||||
|
{
|
||||||
|
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
|
@ -0,0 +1,177 @@
|
|||||||
|
From: Wei Wang <weiwan@google.com>
|
||||||
|
Date: Mon, 8 Feb 2021 11:34:10 -0800
|
||||||
|
Subject: [PATCH] net: add sysfs attribute to control napi threaded mode
|
||||||
|
|
||||||
|
This patch adds a new sysfs attribute to the network device class.
|
||||||
|
Said attribute provides a per-device control to enable/disable the
|
||||||
|
threaded mode for all the napi instances of the given network device,
|
||||||
|
without the need for a device up/down.
|
||||||
|
User sets it to 1 or 0 to enable or disable threaded mode.
|
||||||
|
Note: when switching between threaded and the current softirq based mode
|
||||||
|
for a napi instance, it will not immediately take effect if the napi is
|
||||||
|
currently being polled. The mode switch will happen the next time
|
||||||
|
napi_schedule() is called.
|
||||||
|
|
||||||
|
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
|
||||||
|
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
||||||
|
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
|
||||||
|
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
|
||||||
|
Co-developed-by: Felix Fietkau <nbd@nbd.name>
|
||||||
|
Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
||||||
|
Signed-off-by: Wei Wang <weiwan@google.com>
|
||||||
|
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
|
||||||
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||||
|
---
|
||||||
|
|
||||||
|
--- a/Documentation/ABI/testing/sysfs-class-net
|
||||||
|
+++ b/Documentation/ABI/testing/sysfs-class-net
|
||||||
|
@@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
32-bit unsigned integer counting the number of times the link has
|
||||||
|
been down
|
||||||
|
+
|
||||||
|
+What: /sys/class/net/<iface>/threaded
|
||||||
|
+Date: Jan 2021
|
||||||
|
+KernelVersion: 5.12
|
||||||
|
+Contact: netdev@vger.kernel.org
|
||||||
|
+Description:
|
||||||
|
+ Boolean value to control the threaded mode per device. User could
|
||||||
|
+ set this value to enable/disable threaded mode for all napi
|
||||||
|
+ belonging to this device, without the need to do device up/down.
|
||||||
|
+
|
||||||
|
+ Possible values:
|
||||||
|
+ == ==================================
|
||||||
|
+ 0 threaded mode disabled for this dev
|
||||||
|
+ 1 threaded mode enabled for this dev
|
||||||
|
+ == ==================================
|
||||||
|
--- a/include/linux/netdevice.h
|
||||||
|
+++ b/include/linux/netdevice.h
|
||||||
|
@@ -491,6 +491,8 @@ static inline bool napi_complete(struct
|
||||||
|
return napi_complete_done(n, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
+int dev_set_threaded(struct net_device *dev, bool threaded);
|
||||||
|
+
|
||||||
|
/**
|
||||||
|
* napi_disable - prevent NAPI from scheduling
|
||||||
|
* @n: NAPI context
|
||||||
|
--- a/net/core/dev.c
|
||||||
|
+++ b/net/core/dev.c
|
||||||
|
@@ -4268,8 +4268,9 @@ static inline void ____napi_schedule(str
|
||||||
|
|
||||||
|
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
|
||||||
|
/* Paired with smp_mb__before_atomic() in
|
||||||
|
- * napi_enable(). Use READ_ONCE() to guarantee
|
||||||
|
- * a complete read on napi->thread. Only call
|
||||||
|
+ * napi_enable()/dev_set_threaded().
|
||||||
|
+ * Use READ_ONCE() to guarantee a complete
|
||||||
|
+ * read on napi->thread. Only call
|
||||||
|
* wake_up_process() when it's not NULL.
|
||||||
|
*/
|
||||||
|
thread = READ_ONCE(napi->thread);
|
||||||
|
@@ -6703,6 +6704,49 @@ static void init_gro_hash(struct napi_st
|
||||||
|
napi->gro_bitmask = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
+int dev_set_threaded(struct net_device *dev, bool threaded)
|
||||||
|
+{
|
||||||
|
+ struct napi_struct *napi;
|
||||||
|
+ int err = 0;
|
||||||
|
+
|
||||||
|
+ if (dev->threaded == threaded)
|
||||||
|
+ return 0;
|
||||||
|
+
|
||||||
|
+ if (threaded) {
|
||||||
|
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
|
||||||
|
+ if (!napi->thread) {
|
||||||
|
+ err = napi_kthread_create(napi);
|
||||||
|
+ if (err) {
|
||||||
|
+ threaded = false;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ dev->threaded = threaded;
|
||||||
|
+
|
||||||
|
+ /* Make sure kthread is created before THREADED bit
|
||||||
|
+ * is set.
|
||||||
|
+ */
|
||||||
|
+ smp_mb__before_atomic();
|
||||||
|
+
|
||||||
|
+ /* Setting/unsetting threaded mode on a napi might not immediately
|
||||||
|
+ * take effect, if the current napi instance is actively being
|
||||||
|
+ * polled. In this case, the switch between threaded mode and
|
||||||
|
+ * softirq mode will happen in the next round of napi_schedule().
|
||||||
|
+ * This should not cause hiccups/stalls to the live traffic.
|
||||||
|
+ */
|
||||||
|
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
|
||||||
|
+ if (threaded)
|
||||||
|
+ set_bit(NAPI_STATE_THREADED, &napi->state);
|
||||||
|
+ else
|
||||||
|
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return err;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
|
||||||
|
int (*poll)(struct napi_struct *, int), int weight)
|
||||||
|
{
|
||||||
|
--- a/net/core/net-sysfs.c
|
||||||
|
+++ b/net/core/net-sysfs.c
|
||||||
|
@@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struc
|
||||||
|
}
|
||||||
|
static DEVICE_ATTR_RO(phys_switch_id);
|
||||||
|
|
||||||
|
+static ssize_t threaded_show(struct device *dev,
|
||||||
|
+ struct device_attribute *attr, char *buf)
|
||||||
|
+{
|
||||||
|
+ struct net_device *netdev = to_net_dev(dev);
|
||||||
|
+ ssize_t ret = -EINVAL;
|
||||||
|
+
|
||||||
|
+ if (!rtnl_trylock())
|
||||||
|
+ return restart_syscall();
|
||||||
|
+
|
||||||
|
+ if (dev_isalive(netdev))
|
||||||
|
+ ret = sprintf(buf, fmt_dec, netdev->threaded);
|
||||||
|
+
|
||||||
|
+ rtnl_unlock();
|
||||||
|
+ return ret;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
|
||||||
|
+{
|
||||||
|
+ int ret;
|
||||||
|
+
|
||||||
|
+ if (list_empty(&dev->napi_list))
|
||||||
|
+ return -EOPNOTSUPP;
|
||||||
|
+
|
||||||
|
+ if (val != 0 && val != 1)
|
||||||
|
+ return -EOPNOTSUPP;
|
||||||
|
+
|
||||||
|
+ ret = dev_set_threaded(dev, val);
|
||||||
|
+
|
||||||
|
+ return ret;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static ssize_t threaded_store(struct device *dev,
|
||||||
|
+ struct device_attribute *attr,
|
||||||
|
+ const char *buf, size_t len)
|
||||||
|
+{
|
||||||
|
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
|
||||||
|
+}
|
||||||
|
+static DEVICE_ATTR_RW(threaded);
|
||||||
|
+
|
||||||
|
static struct attribute *net_class_attrs[] __ro_after_init = {
|
||||||
|
&dev_attr_netdev_group.attr,
|
||||||
|
&dev_attr_type.attr,
|
||||||
|
@@ -570,6 +609,7 @@ static struct attribute *net_class_attrs
|
||||||
|
&dev_attr_proto_down.attr,
|
||||||
|
&dev_attr_carrier_up_count.attr,
|
||||||
|
&dev_attr_carrier_down_count.attr,
|
||||||
|
+ &dev_attr_threaded.attr,
|
||||||
|
NULL,
|
||||||
|
};
|
||||||
|
ATTRIBUTE_GROUPS(net_class);
|
@ -0,0 +1,93 @@
|
|||||||
|
From: Wei Wang <weiwan@google.com>
|
||||||
|
Date: Mon, 1 Mar 2021 17:21:13 -0800
|
||||||
|
Subject: [PATCH] net: fix race between napi kthread mode and busy poll
|
||||||
|
|
||||||
|
Currently, napi_thread_wait() checks for NAPI_STATE_SCHED bit to
|
||||||
|
determine if the kthread owns this napi and could call napi->poll() on
|
||||||
|
it. However, if socket busy poll is enabled, it is possible that the
|
||||||
|
busy poll thread grabs this SCHED bit (after the previous napi->poll()
|
||||||
|
invokes napi_complete_done() and clears SCHED bit) and tries to poll
|
||||||
|
on the same napi. napi_disable() could grab the SCHED bit as well.
|
||||||
|
This patch tries to fix this race by adding a new bit
|
||||||
|
NAPI_STATE_SCHED_THREADED in napi->state. This bit gets set in
|
||||||
|
____napi_schedule() if the threaded mode is enabled, and gets cleared
|
||||||
|
in napi_complete_done(), and we only poll the napi in kthread if this
|
||||||
|
bit is set. This helps distinguish the ownership of the napi between
|
||||||
|
kthread and other scenarios and fixes the race issue.
|
||||||
|
|
||||||
|
Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
|
||||||
|
Reported-by: Martin Zaharinov <micron10@gmail.com>
|
||||||
|
Suggested-by: Jakub Kicinski <kuba@kernel.org>
|
||||||
|
Signed-off-by: Wei Wang <weiwan@google.com>
|
||||||
|
Cc: Alexander Duyck <alexanderduyck@fb.com>
|
||||||
|
Cc: Eric Dumazet <edumazet@google.com>
|
||||||
|
Cc: Paolo Abeni <pabeni@redhat.com>
|
||||||
|
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
|
||||||
|
---
|
||||||
|
|
||||||
|
--- a/include/linux/netdevice.h
|
||||||
|
+++ b/include/linux/netdevice.h
|
||||||
|
@@ -359,6 +359,7 @@ enum {
|
||||||
|
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
|
||||||
|
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
|
||||||
|
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
|
||||||
|
+ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
|
||||||
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
@@ -370,6 +371,7 @@ enum {
|
||||||
|
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
|
||||||
|
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
|
||||||
|
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
|
||||||
|
+ NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
|
||||||
|
};
|
||||||
|
|
||||||
|
enum gro_result {
|
||||||
|
--- a/net/core/dev.c
|
||||||
|
+++ b/net/core/dev.c
|
||||||
|
@@ -4275,6 +4275,8 @@ static inline void ____napi_schedule(str
|
||||||
|
*/
|
||||||
|
thread = READ_ONCE(napi->thread);
|
||||||
|
if (thread) {
|
||||||
|
+ if (thread->state != TASK_INTERRUPTIBLE)
|
||||||
|
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
|
||||||
|
wake_up_process(thread);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
@@ -6495,7 +6497,8 @@ bool napi_complete_done(struct napi_stru
|
||||||
|
|
||||||
|
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
|
||||||
|
|
||||||
|
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
|
||||||
|
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
|
||||||
|
+ NAPIF_STATE_SCHED_THREADED);
|
||||||
|
|
||||||
|
/* If STATE_MISSED was set, leave STATE_SCHED set,
|
||||||
|
* because we will call napi->poll() one more time.
|
||||||
|
@@ -6931,16 +6934,25 @@ static int napi_poll(struct napi_struct
|
||||||
|
|
||||||
|
static int napi_thread_wait(struct napi_struct *napi)
|
||||||
|
{
|
||||||
|
+ bool woken = false;
|
||||||
|
+
|
||||||
|
set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
|
||||||
|
while (!kthread_should_stop() && !napi_disable_pending(napi)) {
|
||||||
|
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
|
||||||
|
+ /* Testing SCHED_THREADED bit here to make sure the current
|
||||||
|
+ * kthread owns this napi and could poll on this napi.
|
||||||
|
+ * Testing SCHED bit is not enough because SCHED bit might be
|
||||||
|
+ * set by some other busy poll thread or by napi_disable().
|
||||||
|
+ */
|
||||||
|
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
|
||||||
|
WARN_ON(!list_empty(&napi->poll_list));
|
||||||
|
__set_current_state(TASK_RUNNING);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
schedule();
|
||||||
|
+ /* woken being true indicates this thread owns this napi. */
|
||||||
|
+ woken = true;
|
||||||
|
set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
}
|
||||||
|
__set_current_state(TASK_RUNNING);
|
@ -58,7 +58,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -833,6 +833,27 @@ typedef u16 (*select_queue_fallback_t)(s
|
@@ -827,6 +827,27 @@ typedef u16 (*select_queue_fallback_t)(s
|
||||||
struct sk_buff *skb,
|
struct sk_buff *skb,
|
||||||
struct net_device *sb_dev);
|
struct net_device *sb_dev);
|
||||||
|
|
||||||
@ -86,7 +86,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
enum tc_setup_type {
|
enum tc_setup_type {
|
||||||
TC_SETUP_QDISC_MQPRIO,
|
TC_SETUP_QDISC_MQPRIO,
|
||||||
TC_SETUP_CLSU32,
|
TC_SETUP_CLSU32,
|
||||||
@@ -1279,6 +1300,8 @@ struct netdev_net_notifier {
|
@@ -1273,6 +1294,8 @@ struct netdev_net_notifier {
|
||||||
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
|
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
|
||||||
* If a device is paired with a peer device, return the peer instance.
|
* If a device is paired with a peer device, return the peer instance.
|
||||||
* The caller must be under RCU read context.
|
* The caller must be under RCU read context.
|
||||||
@ -95,7 +95,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
*/
|
*/
|
||||||
struct net_device_ops {
|
struct net_device_ops {
|
||||||
int (*ndo_init)(struct net_device *dev);
|
int (*ndo_init)(struct net_device *dev);
|
||||||
@@ -1487,6 +1510,8 @@ struct net_device_ops {
|
@@ -1481,6 +1504,8 @@ struct net_device_ops {
|
||||||
int (*ndo_tunnel_ctl)(struct net_device *dev,
|
int (*ndo_tunnel_ctl)(struct net_device *dev,
|
||||||
struct ip_tunnel_parm *p, int cmd);
|
struct ip_tunnel_parm *p, int cmd);
|
||||||
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
|
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
|
||||||
@ -104,7 +104,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -2798,6 +2823,8 @@ void dev_remove_offload(struct packet_of
|
@@ -2795,6 +2820,8 @@ void dev_remove_offload(struct packet_of
|
||||||
|
|
||||||
int dev_get_iflink(const struct net_device *dev);
|
int dev_get_iflink(const struct net_device *dev);
|
||||||
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
|
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
|
||||||
@ -115,7 +115,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
struct net_device *dev_get_by_name(struct net *net, const char *name);
|
struct net_device *dev_get_by_name(struct net *net, const char *name);
|
||||||
--- a/net/core/dev.c
|
--- a/net/core/dev.c
|
||||||
+++ b/net/core/dev.c
|
+++ b/net/core/dev.c
|
||||||
@@ -846,6 +846,52 @@ int dev_fill_metadata_dst(struct net_dev
|
@@ -847,6 +847,52 @@ int dev_fill_metadata_dst(struct net_dev
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
|
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -835,11 +835,18 @@ typedef u16 (*select_queue_fallback_t)(s
|
@@ -829,11 +829,18 @@ typedef u16 (*select_queue_fallback_t)(s
|
||||||
|
|
||||||
enum net_device_path_type {
|
enum net_device_path_type {
|
||||||
DEV_PATH_ETHERNET = 0,
|
DEV_PATH_ETHERNET = 0,
|
||||||
|
@ -9,7 +9,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -836,6 +836,7 @@ typedef u16 (*select_queue_fallback_t)(s
|
@@ -830,6 +830,7 @@ typedef u16 (*select_queue_fallback_t)(s
|
||||||
enum net_device_path_type {
|
enum net_device_path_type {
|
||||||
DEV_PATH_ETHERNET = 0,
|
DEV_PATH_ETHERNET = 0,
|
||||||
DEV_PATH_VLAN,
|
DEV_PATH_VLAN,
|
||||||
|
@ -15,7 +15,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -847,10 +847,20 @@ struct net_device_path {
|
@@ -841,10 +841,20 @@ struct net_device_path {
|
||||||
u16 id;
|
u16 id;
|
||||||
__be16 proto;
|
__be16 proto;
|
||||||
} encap;
|
} encap;
|
||||||
@ -36,7 +36,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
|
|||||||
|
|
||||||
struct net_device_path_stack {
|
struct net_device_path_stack {
|
||||||
int num_paths;
|
int num_paths;
|
||||||
@@ -860,6 +870,12 @@ struct net_device_path_stack {
|
@@ -854,6 +864,12 @@ struct net_device_path_stack {
|
||||||
struct net_device_path_ctx {
|
struct net_device_path_ctx {
|
||||||
const struct net_device *dev;
|
const struct net_device *dev;
|
||||||
const u8 *daddr;
|
const u8 *daddr;
|
||||||
|
@ -78,7 +78,7 @@ Pass on the PPPoE session ID and the real device.
|
|||||||
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
|
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -837,6 +837,7 @@ enum net_device_path_type {
|
@@ -831,6 +831,7 @@ enum net_device_path_type {
|
||||||
DEV_PATH_ETHERNET = 0,
|
DEV_PATH_ETHERNET = 0,
|
||||||
DEV_PATH_VLAN,
|
DEV_PATH_VLAN,
|
||||||
DEV_PATH_BRIDGE,
|
DEV_PATH_BRIDGE,
|
||||||
|
@ -7,7 +7,7 @@ Add .ndo_fill_forward_path for dsa slave port devices
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -838,6 +838,7 @@ enum net_device_path_type {
|
@@ -832,6 +832,7 @@ enum net_device_path_type {
|
||||||
DEV_PATH_VLAN,
|
DEV_PATH_VLAN,
|
||||||
DEV_PATH_BRIDGE,
|
DEV_PATH_BRIDGE,
|
||||||
DEV_PATH_PPPOE,
|
DEV_PATH_PPPOE,
|
||||||
@ -15,7 +15,7 @@ Add .ndo_fill_forward_path for dsa slave port devices
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct net_device_path {
|
struct net_device_path {
|
||||||
@@ -857,6 +858,10 @@ struct net_device_path {
|
@@ -851,6 +852,10 @@ struct net_device_path {
|
||||||
u16 vlan_id;
|
u16 vlan_id;
|
||||||
__be16 vlan_proto;
|
__be16 vlan_proto;
|
||||||
} bridge;
|
} bridge;
|
||||||
|
@ -27,7 +27,7 @@ Subject: [PATCH] netfilter: flowtable: add pppoe support
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -848,6 +848,7 @@ struct net_device_path {
|
@@ -842,6 +842,7 @@ struct net_device_path {
|
||||||
struct {
|
struct {
|
||||||
u16 id;
|
u16 id;
|
||||||
__be16 proto;
|
__be16 proto;
|
||||||
|
@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -855,6 +855,7 @@ struct net_device_path {
|
@@ -849,6 +849,7 @@ struct net_device_path {
|
||||||
DEV_PATH_BR_VLAN_KEEP,
|
DEV_PATH_BR_VLAN_KEEP,
|
||||||
DEV_PATH_BR_VLAN_TAG,
|
DEV_PATH_BR_VLAN_TAG,
|
||||||
DEV_PATH_BR_VLAN_UNTAG,
|
DEV_PATH_BR_VLAN_UNTAG,
|
||||||
|
@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
--- a/include/linux/netdevice.h
|
||||||
+++ b/include/linux/netdevice.h
|
+++ b/include/linux/netdevice.h
|
||||||
@@ -2033,6 +2033,8 @@ struct net_device {
|
@@ -2029,6 +2029,8 @@ struct net_device {
|
||||||
struct netdev_hw_addr_list mc;
|
struct netdev_hw_addr_list mc;
|
||||||
struct netdev_hw_addr_list dev_addrs;
|
struct netdev_hw_addr_list dev_addrs;
|
||||||
|
|
||||||
@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
__u16 tc_index; /* traffic control index */
|
__u16 tc_index; /* traffic control index */
|
||||||
--- a/net/core/dev.c
|
--- a/net/core/dev.c
|
||||||
+++ b/net/core/dev.c
|
+++ b/net/core/dev.c
|
||||||
@@ -5965,6 +5965,9 @@ static enum gro_result dev_gro_receive(s
|
@@ -6005,6 +6005,9 @@ static enum gro_result dev_gro_receive(s
|
||||||
int same_flow;
|
int same_flow;
|
||||||
int grow;
|
int grow;
|
||||||
|
|
||||||
@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
if (netif_elide_gro(skb->dev))
|
if (netif_elide_gro(skb->dev))
|
||||||
goto normal;
|
goto normal;
|
||||||
|
|
||||||
@@ -7793,6 +7796,48 @@ static void __netdev_adjacent_dev_unlink
|
@@ -7973,6 +7976,48 @@ static void __netdev_adjacent_dev_unlink
|
||||||
&upper_dev->adj_list.lower);
|
&upper_dev->adj_list.lower);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -91,7 +91,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
static int __netdev_upper_dev_link(struct net_device *dev,
|
static int __netdev_upper_dev_link(struct net_device *dev,
|
||||||
struct net_device *upper_dev, bool master,
|
struct net_device *upper_dev, bool master,
|
||||||
void *upper_priv, void *upper_info,
|
void *upper_priv, void *upper_info,
|
||||||
@@ -7844,6 +7889,7 @@ static int __netdev_upper_dev_link(struc
|
@@ -8024,6 +8069,7 @@ static int __netdev_upper_dev_link(struc
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
@ -99,7 +99,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
|
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
|
||||||
&changeupper_info.info);
|
&changeupper_info.info);
|
||||||
ret = notifier_to_errno(ret);
|
ret = notifier_to_errno(ret);
|
||||||
@@ -7940,6 +7986,7 @@ static void __netdev_upper_dev_unlink(st
|
@@ -8120,6 +8166,7 @@ static void __netdev_upper_dev_unlink(st
|
||||||
|
|
||||||
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
|
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
|
||||||
|
|
||||||
@ -107,7 +107,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||||||
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
|
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
|
||||||
&changeupper_info.info);
|
&changeupper_info.info);
|
||||||
|
|
||||||
@@ -8726,6 +8773,7 @@ int dev_set_mac_address(struct net_devic
|
@@ -8906,6 +8953,7 @@ int dev_set_mac_address(struct net_devic
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
dev->addr_assign_type = NET_ADDR_SET;
|
dev->addr_assign_type = NET_ADDR_SET;
|
||||||
|
@ -1,301 +0,0 @@
|
|||||||
From: Felix Fietkau <nbd@nbd.name>
|
|
||||||
Date: Sun, 26 Jul 2020 14:03:21 +0200
|
|
||||||
Subject: [PATCH] net: add support for threaded NAPI polling
|
|
||||||
|
|
||||||
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
|
|
||||||
poll function does not perform well. Since NAPI poll is bound to the CPU it
|
|
||||||
was scheduled from, we can easily end up with a few very busy CPUs spending
|
|
||||||
most of their time in softirq/ksoftirqd and some idle ones.
|
|
||||||
|
|
||||||
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
|
|
||||||
same except for using netif_threaded_napi_add instead of netif_napi_add.
|
|
||||||
|
|
||||||
In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
|
|
||||||
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
|
|
||||||
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
|
|
||||||
thread.
|
|
||||||
|
|
||||||
With threaded NAPI, throughput seems stable and consistent (and higher than
|
|
||||||
the best results I got without it).
|
|
||||||
|
|
||||||
Based on a patch by Hillf Danton
|
|
||||||
|
|
||||||
Cc: Hillf Danton <hdanton@sina.com>
|
|
||||||
Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|
||||||
---
|
|
||||||
|
|
||||||
--- a/include/linux/netdevice.h
|
|
||||||
+++ b/include/linux/netdevice.h
|
|
||||||
@@ -347,6 +347,7 @@ struct napi_struct {
|
|
||||||
struct list_head dev_list;
|
|
||||||
struct hlist_node napi_hash_node;
|
|
||||||
unsigned int napi_id;
|
|
||||||
+ struct work_struct work;
|
|
||||||
};
|
|
||||||
|
|
||||||
enum {
|
|
||||||
@@ -357,6 +358,7 @@ enum {
|
|
||||||
NAPI_STATE_LISTED, /* NAPI added to system lists */
|
|
||||||
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
|
|
||||||
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
|
|
||||||
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
|
|
||||||
};
|
|
||||||
|
|
||||||
enum {
|
|
||||||
@@ -367,6 +369,7 @@ enum {
|
|
||||||
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
|
|
||||||
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
|
|
||||||
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
|
|
||||||
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
|
|
||||||
};
|
|
||||||
|
|
||||||
enum gro_result {
|
|
||||||
@@ -2211,6 +2214,7 @@ struct net_device {
|
|
||||||
struct lock_class_key *qdisc_running_key;
|
|
||||||
bool proto_down;
|
|
||||||
unsigned wol_enabled:1;
|
|
||||||
+ unsigned threaded:1;
|
|
||||||
|
|
||||||
struct list_head net_notifier_list;
|
|
||||||
|
|
||||||
@@ -2413,6 +2417,26 @@ void netif_napi_add(struct net_device *d
|
|
||||||
int (*poll)(struct napi_struct *, int), int weight);
|
|
||||||
|
|
||||||
/**
|
|
||||||
+ * netif_threaded_napi_add - initialize a NAPI context
|
|
||||||
+ * @dev: network device
|
|
||||||
+ * @napi: NAPI context
|
|
||||||
+ * @poll: polling function
|
|
||||||
+ * @weight: default weight
|
|
||||||
+ *
|
|
||||||
+ * This variant of netif_napi_add() should be used from drivers using NAPI
|
|
||||||
+ * with CPU intensive poll functions.
|
|
||||||
+ * This will schedule polling from a high priority workqueue
|
|
||||||
+ */
|
|
||||||
+static inline void netif_threaded_napi_add(struct net_device *dev,
|
|
||||||
+ struct napi_struct *napi,
|
|
||||||
+ int (*poll)(struct napi_struct *, int),
|
|
||||||
+ int weight)
|
|
||||||
+{
|
|
||||||
+ set_bit(NAPI_STATE_THREADED, &napi->state);
|
|
||||||
+ netif_napi_add(dev, napi, poll, weight);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/**
|
|
||||||
* netif_tx_napi_add - initialize a NAPI context
|
|
||||||
* @dev: network device
|
|
||||||
* @napi: NAPI context
|
|
||||||
--- a/net/core/dev.c
|
|
||||||
+++ b/net/core/dev.c
|
|
||||||
@@ -159,6 +159,7 @@ static DEFINE_SPINLOCK(offload_lock);
|
|
||||||
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
|
|
||||||
struct list_head ptype_all __read_mostly; /* Taps */
|
|
||||||
static struct list_head offload_base __read_mostly;
|
|
||||||
+static struct workqueue_struct *napi_workq __read_mostly;
|
|
||||||
|
|
||||||
static int netif_rx_internal(struct sk_buff *skb);
|
|
||||||
static int call_netdevice_notifiers_info(unsigned long val,
|
|
||||||
@@ -6407,6 +6408,11 @@ void __napi_schedule(struct napi_struct
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
|
|
||||||
+ queue_work(napi_workq, &n->work);
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
local_irq_save(flags);
|
|
||||||
____napi_schedule(this_cpu_ptr(&softnet_data), n);
|
|
||||||
local_irq_restore(flags);
|
|
||||||
@@ -6454,6 +6460,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
|
|
||||||
*/
|
|
||||||
void __napi_schedule_irqoff(struct napi_struct *n)
|
|
||||||
{
|
|
||||||
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
|
|
||||||
+ queue_work(napi_workq, &n->work);
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
____napi_schedule(this_cpu_ptr(&softnet_data), n);
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL(__napi_schedule_irqoff);
|
|
||||||
@@ -6715,12 +6726,94 @@ static void init_gro_hash(struct napi_st
|
|
||||||
napi->gro_bitmask = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
+static int __napi_poll(struct napi_struct *n, bool *repoll)
|
|
||||||
+{
|
|
||||||
+ int work, weight;
|
|
||||||
+
|
|
||||||
+ weight = n->weight;
|
|
||||||
+
|
|
||||||
+ /* This NAPI_STATE_SCHED test is for avoiding a race
|
|
||||||
+ * with netpoll's poll_napi(). Only the entity which
|
|
||||||
+ * obtains the lock and sees NAPI_STATE_SCHED set will
|
|
||||||
+ * actually make the ->poll() call. Therefore we avoid
|
|
||||||
+ * accidentally calling ->poll() when NAPI is not scheduled.
|
|
||||||
+ */
|
|
||||||
+ work = 0;
|
|
||||||
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
|
|
||||||
+ work = n->poll(n, weight);
|
|
||||||
+ trace_napi_poll(n, work, weight);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (unlikely(work > weight))
|
|
||||||
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
|
|
||||||
+ n->poll, work, weight);
|
|
||||||
+
|
|
||||||
+ if (likely(work < weight))
|
|
||||||
+ return work;
|
|
||||||
+
|
|
||||||
+ /* Drivers must not modify the NAPI state if they
|
|
||||||
+ * consume the entire weight. In such cases this code
|
|
||||||
+ * still "owns" the NAPI instance and therefore can
|
|
||||||
+ * move the instance around on the list at-will.
|
|
||||||
+ */
|
|
||||||
+ if (unlikely(napi_disable_pending(n))) {
|
|
||||||
+ napi_complete(n);
|
|
||||||
+ return work;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (n->gro_bitmask) {
|
|
||||||
+ /* flush too old packets
|
|
||||||
+ * If HZ < 1000, flush all packets.
|
|
||||||
+ */
|
|
||||||
+ napi_gro_flush(n, HZ >= 1000);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ gro_normal_list(n);
|
|
||||||
+
|
|
||||||
+ *repoll = true;
|
|
||||||
+
|
|
||||||
+ return work;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static void napi_workfn(struct work_struct *work)
|
|
||||||
+{
|
|
||||||
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
|
|
||||||
+ void *have;
|
|
||||||
+
|
|
||||||
+ for (;;) {
|
|
||||||
+ bool repoll = false;
|
|
||||||
+
|
|
||||||
+ local_bh_disable();
|
|
||||||
+
|
|
||||||
+ have = netpoll_poll_lock(n);
|
|
||||||
+ __napi_poll(n, &repoll);
|
|
||||||
+ netpoll_poll_unlock(have);
|
|
||||||
+
|
|
||||||
+ local_bh_enable();
|
|
||||||
+
|
|
||||||
+ if (!repoll)
|
|
||||||
+ return;
|
|
||||||
+
|
|
||||||
+ if (!need_resched())
|
|
||||||
+ continue;
|
|
||||||
+
|
|
||||||
+ /*
|
|
||||||
+ * have to pay for the latency of task switch even if
|
|
||||||
+ * napi is scheduled
|
|
||||||
+ */
|
|
||||||
+ queue_work(napi_workq, work);
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
|
|
||||||
int (*poll)(struct napi_struct *, int), int weight)
|
|
||||||
{
|
|
||||||
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
|
|
||||||
return;
|
|
||||||
|
|
||||||
+ if (dev->threaded)
|
|
||||||
+ set_bit(NAPI_STATE_THREADED, &napi->state);
|
|
||||||
INIT_LIST_HEAD(&napi->poll_list);
|
|
||||||
INIT_HLIST_NODE(&napi->napi_hash_node);
|
|
||||||
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
|
|
||||||
@@ -6738,6 +6831,7 @@ void netif_napi_add(struct net_device *d
|
|
||||||
#ifdef CONFIG_NETPOLL
|
|
||||||
napi->poll_owner = -1;
|
|
||||||
#endif
|
|
||||||
+ INIT_WORK(&napi->work, napi_workfn);
|
|
||||||
set_bit(NAPI_STATE_SCHED, &napi->state);
|
|
||||||
set_bit(NAPI_STATE_NPSVC, &napi->state);
|
|
||||||
list_add_rcu(&napi->dev_list, &dev->napi_list);
|
|
||||||
@@ -6780,6 +6874,7 @@ void __netif_napi_del(struct napi_struct
|
|
||||||
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
|
|
||||||
return;
|
|
||||||
|
|
||||||
+ cancel_work_sync(&napi->work);
|
|
||||||
napi_hash_del(napi);
|
|
||||||
list_del_rcu(&napi->dev_list);
|
|
||||||
napi_free_frags(napi);
|
|
||||||
@@ -6791,53 +6886,19 @@ EXPORT_SYMBOL(__netif_napi_del);
|
|
||||||
|
|
||||||
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
|
|
||||||
{
|
|
||||||
+ bool do_repoll = false;
|
|
||||||
void *have;
|
|
||||||
- int work, weight;
|
|
||||||
+ int work;
|
|
||||||
|
|
||||||
list_del_init(&n->poll_list);
|
|
||||||
|
|
||||||
have = netpoll_poll_lock(n);
|
|
||||||
|
|
||||||
- weight = n->weight;
|
|
||||||
+ work = __napi_poll(n, &do_repoll);
|
|
||||||
|
|
||||||
- /* This NAPI_STATE_SCHED test is for avoiding a race
|
|
||||||
- * with netpoll's poll_napi(). Only the entity which
|
|
||||||
- * obtains the lock and sees NAPI_STATE_SCHED set will
|
|
||||||
- * actually make the ->poll() call. Therefore we avoid
|
|
||||||
- * accidentally calling ->poll() when NAPI is not scheduled.
|
|
||||||
- */
|
|
||||||
- work = 0;
|
|
||||||
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
|
|
||||||
- work = n->poll(n, weight);
|
|
||||||
- trace_napi_poll(n, work, weight);
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- if (unlikely(work > weight))
|
|
||||||
- pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
|
|
||||||
- n->poll, work, weight);
|
|
||||||
-
|
|
||||||
- if (likely(work < weight))
|
|
||||||
+ if (!do_repoll)
|
|
||||||
goto out_unlock;
|
|
||||||
|
|
||||||
- /* Drivers must not modify the NAPI state if they
|
|
||||||
- * consume the entire weight. In such cases this code
|
|
||||||
- * still "owns" the NAPI instance and therefore can
|
|
||||||
- * move the instance around on the list at-will.
|
|
||||||
- */
|
|
||||||
- if (unlikely(napi_disable_pending(n))) {
|
|
||||||
- napi_complete(n);
|
|
||||||
- goto out_unlock;
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- if (n->gro_bitmask) {
|
|
||||||
- /* flush too old packets
|
|
||||||
- * If HZ < 1000, flush all packets.
|
|
||||||
- */
|
|
||||||
- napi_gro_flush(n, HZ >= 1000);
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- gro_normal_list(n);
|
|
||||||
-
|
|
||||||
/* Some drivers may have called napi_schedule
|
|
||||||
* prior to exhausting their budget.
|
|
||||||
*/
|
|
||||||
@@ -11333,6 +11394,10 @@ static int __init net_dev_init(void)
|
|
||||||
sd->backlog.weight = weight_p;
|
|
||||||
}
|
|
||||||
|
|
||||||
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
|
|
||||||
+ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
|
|
||||||
+ BUG_ON(!napi_workq);
|
|
||||||
+
|
|
||||||
dev_boot_phase = 0;
|
|
||||||
|
|
||||||
/* The loopback device is special if any other network devices
|
|
@ -1,74 +0,0 @@
|
|||||||
From: Felix Fietkau <nbd@nbd.name>
|
|
||||||
Date: Fri, 21 Aug 2020 15:07:54 +0200
|
|
||||||
Subject: [PATCH] net: add sysfs attribute for enabling threaded NAPI
|
|
||||||
|
|
||||||
This can be used to enable threaded NAPI on drivers that did not explicitly
|
|
||||||
request it.
|
|
||||||
|
|
||||||
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
|
|
||||||
Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|
||||||
---
|
|
||||||
|
|
||||||
--- a/net/core/net-sysfs.c
|
|
||||||
+++ b/net/core/net-sysfs.c
|
|
||||||
@@ -472,6 +472,52 @@ static ssize_t proto_down_store(struct d
|
|
||||||
}
|
|
||||||
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
|
|
||||||
|
|
||||||
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
|
|
||||||
+{
|
|
||||||
+ struct napi_struct *napi;
|
|
||||||
+
|
|
||||||
+ if (list_empty(&dev->napi_list))
|
|
||||||
+ return -EOPNOTSUPP;
|
|
||||||
+
|
|
||||||
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
|
|
||||||
+ if (val)
|
|
||||||
+ set_bit(NAPI_STATE_THREADED, &napi->state);
|
|
||||||
+ else
|
|
||||||
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static ssize_t napi_threaded_store(struct device *dev,
|
|
||||||
+ struct device_attribute *attr,
|
|
||||||
+ const char *buf, size_t len)
|
|
||||||
+{
|
|
||||||
+ return netdev_store(dev, attr, buf, len, change_napi_threaded);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static ssize_t napi_threaded_show(struct device *dev,
|
|
||||||
+ struct device_attribute *attr,
|
|
||||||
+ char *buf)
|
|
||||||
+{
|
|
||||||
+ struct net_device *netdev = to_net_dev(dev);
|
|
||||||
+ struct napi_struct *napi;
|
|
||||||
+ bool enabled = false;
|
|
||||||
+
|
|
||||||
+ if (!rtnl_trylock())
|
|
||||||
+ return restart_syscall();
|
|
||||||
+
|
|
||||||
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
|
|
||||||
+ if (test_bit(NAPI_STATE_THREADED, &napi->state))
|
|
||||||
+ enabled = true;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ rtnl_unlock();
|
|
||||||
+
|
|
||||||
+ return sprintf(buf, fmt_dec, enabled);
|
|
||||||
+}
|
|
||||||
+static DEVICE_ATTR_RW(napi_threaded);
|
|
||||||
+
|
|
||||||
static ssize_t phys_port_id_show(struct device *dev,
|
|
||||||
struct device_attribute *attr, char *buf)
|
|
||||||
{
|
|
||||||
@@ -564,6 +610,7 @@ static struct attribute *net_class_attrs
|
|
||||||
&dev_attr_tx_queue_len.attr,
|
|
||||||
&dev_attr_gro_flush_timeout.attr,
|
|
||||||
&dev_attr_napi_defer_hard_irqs.attr,
|
|
||||||
+ &dev_attr_napi_threaded.attr,
|
|
||||||
&dev_attr_phys_port_id.attr,
|
|
||||||
&dev_attr_phys_port_name.attr,
|
|
||||||
&dev_attr_phys_switch_id.attr,
|
|
Loading…
x
Reference in New Issue
Block a user