2020-10-24 19:14:16 +00:00
|
|
|
From: Felix Fietkau <nbd@nbd.name>
|
|
|
|
Date: Sun, 26 Jul 2020 14:03:21 +0200
|
|
|
|
Subject: [PATCH] net: add support for threaded NAPI polling
|
|
|
|
|
|
|
|
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
|
|
|
|
poll function does not perform well. Since NAPI poll is bound to the CPU it
|
|
|
|
was scheduled from, we can easily end up with a few very busy CPUs spending
|
|
|
|
most of their time in softirq/ksoftirqd while other CPUs sit idle.
|
|
|
|
|
|
|
|
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
|
|
|
|
same except for using netif_threaded_napi_add instead of netif_napi_add.
|
|
|
|
|
|
|
|
In my tests with mt76 on MT7621, using threaded NAPI + a thread for tx scheduling
|
|
|
|
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
|
|
|
|
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
|
|
|
|
thread.
|
|
|
|
|
|
|
|
With threaded NAPI, throughput seems stable and consistent (and higher than
|
|
|
|
the best results I got without it).
|
|
|
|
|
|
|
|
Based on a patch by Hillf Danton
|
|
|
|
|
|
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
|
|
Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|
|
|
---
|
|
|
|
|
|
|
|
--- a/include/linux/netdevice.h
|
|
|
|
+++ b/include/linux/netdevice.h
|
|
|
|
@@ -347,6 +347,7 @@ struct napi_struct {
|
|
|
|
struct list_head dev_list;
|
|
|
|
struct hlist_node napi_hash_node;
|
|
|
|
unsigned int napi_id;
|
|
|
|
+ struct work_struct work;
|
|
|
|
};
|
|
|
|
|
|
|
|
enum {
|
|
|
|
@@ -357,6 +358,7 @@ enum {
|
|
|
|
NAPI_STATE_LISTED, /* NAPI added to system lists */
|
|
|
|
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
|
|
|
|
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
|
|
|
|
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
|
|
|
|
};
|
|
|
|
|
|
|
|
enum {
|
|
|
|
@@ -367,6 +369,7 @@ enum {
|
|
|
|
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
|
|
|
|
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
|
|
|
|
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
|
|
|
|
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
|
|
|
|
};
|
|
|
|
|
|
|
|
enum gro_result {
|
2021-03-08 16:20:20 +00:00
|
|
|
@@ -2413,6 +2416,26 @@ void netif_napi_add(struct net_device *d
|
2020-10-24 19:14:16 +00:00
|
|
|
int (*poll)(struct napi_struct *, int), int weight);
|
|
|
|
|
|
|
|
/**
|
|
|
|
+ * netif_threaded_napi_add - initialize a threaded NAPI context
|
|
|
|
+ * @dev: network device
|
|
|
|
+ * @napi: NAPI context
|
|
|
|
+ * @poll: polling function
|
|
|
|
+ * @weight: default weight
|
|
|
|
+ *
|
|
|
|
+ * This variant of netif_napi_add() should be used from drivers using NAPI
|
|
|
|
+ * with CPU intensive poll functions.
|
|
|
|
+ * This will schedule polling from a high-priority workqueue.
|
|
|
|
+ */
|
|
|
|
+static inline void netif_threaded_napi_add(struct net_device *dev,
|
|
|
|
+ struct napi_struct *napi,
|
|
|
|
+ int (*poll)(struct napi_struct *, int),
|
|
|
|
+ int weight)
|
|
|
|
+{
|
|
|
|
+ set_bit(NAPI_STATE_THREADED, &napi->state);
|
|
|
|
+ netif_napi_add(dev, napi, poll, weight);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/**
|
|
|
|
* netif_tx_napi_add - initialize a NAPI context
|
|
|
|
* @dev: network device
|
|
|
|
* @napi: NAPI context
|
|
|
|
--- a/net/core/dev.c
|
|
|
|
+++ b/net/core/dev.c
|
|
|
|
@@ -159,6 +159,7 @@ static DEFINE_SPINLOCK(offload_lock);
|
|
|
|
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
|
|
|
|
struct list_head ptype_all __read_mostly; /* Taps */
|
|
|
|
static struct list_head offload_base __read_mostly;
|
|
|
|
+static struct workqueue_struct *napi_workq __read_mostly;
|
|
|
|
|
|
|
|
static int netif_rx_internal(struct sk_buff *skb);
|
|
|
|
static int call_netdevice_notifiers_info(unsigned long val,
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6407,6 +6408,11 @@ void __napi_schedule(struct napi_struct
|
2020-10-24 19:14:16 +00:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
|
|
|
|
+ queue_work(napi_workq, &n->work);
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
local_irq_save(flags);
|
|
|
|
____napi_schedule(this_cpu_ptr(&softnet_data), n);
|
|
|
|
local_irq_restore(flags);
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6454,6 +6460,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
|
2020-10-24 19:14:16 +00:00
|
|
|
*/
|
|
|
|
void __napi_schedule_irqoff(struct napi_struct *n)
|
|
|
|
{
|
|
|
|
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
|
|
|
|
+ queue_work(napi_workq, &n->work);
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
____napi_schedule(this_cpu_ptr(&softnet_data), n);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__napi_schedule_irqoff);
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6715,6 +6726,86 @@ static void init_gro_hash(struct napi_st
|
2020-10-24 19:14:16 +00:00
|
|
|
napi->gro_bitmask = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
+static int __napi_poll(struct napi_struct *n, bool *repoll)
|
|
|
|
+{
|
|
|
|
+ int work, weight;
|
|
|
|
+
|
|
|
|
+ weight = n->weight;
|
|
|
|
+
|
|
|
|
+ /* This NAPI_STATE_SCHED test is for avoiding a race
|
|
|
|
+ * with netpoll's poll_napi(). Only the entity which
|
|
|
|
+ * obtains the lock and sees NAPI_STATE_SCHED set will
|
|
|
|
+ * actually make the ->poll() call. Therefore we avoid
|
|
|
|
+ * accidentally calling ->poll() when NAPI is not scheduled.
|
|
|
|
+ */
|
|
|
|
+ work = 0;
|
|
|
|
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
|
|
|
|
+ work = n->poll(n, weight);
|
|
|
|
+ trace_napi_poll(n, work, weight);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (unlikely(work > weight))
|
|
|
|
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
|
|
|
|
+ n->poll, work, weight);
|
|
|
|
+
|
|
|
|
+ if (likely(work < weight))
|
|
|
|
+ return work;
|
|
|
|
+
|
|
|
|
+ /* Drivers must not modify the NAPI state if they
|
|
|
|
+ * consume the entire weight. In such cases this code
|
|
|
|
+ * still "owns" the NAPI instance and therefore can
|
|
|
|
+ * move the instance around on the list at-will.
|
|
|
|
+ */
|
|
|
|
+ if (unlikely(napi_disable_pending(n))) {
|
|
|
|
+ napi_complete(n);
|
|
|
|
+ return work;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (n->gro_bitmask) {
|
|
|
|
+ /* flush too old packets
|
|
|
|
+ * If HZ < 1000, flush all packets.
|
|
|
|
+ */
|
|
|
|
+ napi_gro_flush(n, HZ >= 1000);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ gro_normal_list(n);
|
|
|
|
+
|
|
|
|
+ *repoll = true;
|
|
|
|
+
|
|
|
|
+ return work;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void napi_workfn(struct work_struct *work)
|
|
|
|
+{
|
|
|
|
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
|
|
|
|
+ void *have;
|
|
|
|
+
|
|
|
|
+ for (;;) {
|
|
|
|
+ bool repoll = false;
|
|
|
|
+
|
|
|
|
+ local_bh_disable();
|
|
|
|
+
|
|
|
|
+ have = netpoll_poll_lock(n);
|
|
|
|
+ __napi_poll(n, &repoll);
|
|
|
|
+ netpoll_poll_unlock(have);
|
|
|
|
+
|
|
|
|
+ local_bh_enable();
|
|
|
|
+
|
|
|
|
+ if (!repoll)
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+ if (!need_resched())
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * The CPU is needed elsewhere: requeue the work and accept the
|
|
|
|
+ * task switch latency even though more polling is pending.
|
|
|
|
+ */
|
|
|
|
+ queue_work(napi_workq, work);
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
|
|
|
|
int (*poll)(struct napi_struct *, int), int weight)
|
|
|
|
{
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6738,6 +6829,7 @@ void netif_napi_add(struct net_device *d
|
2020-10-24 19:14:16 +00:00
|
|
|
#ifdef CONFIG_NETPOLL
|
|
|
|
napi->poll_owner = -1;
|
|
|
|
#endif
|
|
|
|
+ INIT_WORK(&napi->work, napi_workfn);
|
|
|
|
set_bit(NAPI_STATE_SCHED, &napi->state);
|
|
|
|
set_bit(NAPI_STATE_NPSVC, &napi->state);
|
|
|
|
list_add_rcu(&napi->dev_list, &dev->napi_list);
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6780,6 +6872,7 @@ void __netif_napi_del(struct napi_struct
|
2020-10-24 19:14:16 +00:00
|
|
|
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
|
|
|
|
return;
|
|
|
|
|
|
|
|
+ cancel_work_sync(&napi->work);
|
|
|
|
napi_hash_del(napi);
|
|
|
|
list_del_rcu(&napi->dev_list);
|
|
|
|
napi_free_frags(napi);
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -6791,53 +6884,19 @@ EXPORT_SYMBOL(__netif_napi_del);
|
2020-10-24 19:14:16 +00:00
|
|
|
|
|
|
|
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
|
|
|
|
{
|
|
|
|
+ bool do_repoll = false;
|
|
|
|
void *have;
|
|
|
|
- int work, weight;
|
|
|
|
+ int work;
|
|
|
|
|
|
|
|
list_del_init(&n->poll_list);
|
|
|
|
|
|
|
|
have = netpoll_poll_lock(n);
|
|
|
|
|
|
|
|
- weight = n->weight;
|
2021-02-16 22:39:32 +00:00
|
|
|
+ work = __napi_poll(n, &do_repoll);
|
|
|
|
|
2020-10-24 19:14:16 +00:00
|
|
|
- /* This NAPI_STATE_SCHED test is for avoiding a race
|
|
|
|
- * with netpoll's poll_napi(). Only the entity which
|
|
|
|
- * obtains the lock and sees NAPI_STATE_SCHED set will
|
|
|
|
- * actually make the ->poll() call. Therefore we avoid
|
|
|
|
- * accidentally calling ->poll() when NAPI is not scheduled.
|
|
|
|
- */
|
|
|
|
- work = 0;
|
|
|
|
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
|
|
|
|
- work = n->poll(n, weight);
|
|
|
|
- trace_napi_poll(n, work, weight);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (unlikely(work > weight))
|
|
|
|
- pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
|
|
|
|
- n->poll, work, weight);
|
|
|
|
-
|
|
|
|
- if (likely(work < weight))
|
2021-02-16 22:39:32 +00:00
|
|
|
+ if (!do_repoll)
|
|
|
|
goto out_unlock;
|
2020-10-24 19:14:16 +00:00
|
|
|
|
|
|
|
- /* Drivers must not modify the NAPI state if they
|
|
|
|
- * consume the entire weight. In such cases this code
|
|
|
|
- * still "owns" the NAPI instance and therefore can
|
|
|
|
- * move the instance around on the list at-will.
|
|
|
|
- */
|
|
|
|
- if (unlikely(napi_disable_pending(n))) {
|
|
|
|
- napi_complete(n);
|
2021-02-16 22:39:32 +00:00
|
|
|
- goto out_unlock;
|
2020-10-24 19:14:16 +00:00
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (n->gro_bitmask) {
|
|
|
|
- /* flush too old packets
|
|
|
|
- * If HZ < 1000, flush all packets.
|
|
|
|
- */
|
|
|
|
- napi_gro_flush(n, HZ >= 1000);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- gro_normal_list(n);
|
2021-02-16 22:39:32 +00:00
|
|
|
-
|
2020-10-24 19:14:16 +00:00
|
|
|
/* Some drivers may have called napi_schedule
|
|
|
|
* prior to exhausting their budget.
|
2021-02-16 22:39:32 +00:00
|
|
|
*/
|
2021-02-23 17:29:49 +00:00
|
|
|
@@ -11291,6 +11350,10 @@ static int __init net_dev_init(void)
|
2020-10-24 19:14:16 +00:00
|
|
|
sd->backlog.weight = weight_p;
|
|
|
|
}
|
|
|
|
|
|
|
|
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI |
|
|
|
|
+ WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
|
|
|
|
+ BUG_ON(!napi_workq);
|
|
|
|
+
|
|
|
|
dev_boot_phase = 0;
|
|
|
|
|
|
|
|
/* The loopback device is special if any other network devices
|