Skip to content

Commit 6f8b12d

Browse files
edumazet and davem330
authored and committed
net: napi: add hard irqs deferral feature
Back in commit 3b47d30 ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, which we used to keep incomplete packets in the GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b605 ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows arming the napi hrtimer instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on a 100Gbit NIC. In contrast, hrtimers use local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy as gro_flush_timeout (/sys/class/net/ethX/). By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone: NIC hard irqs should be rearmed as before. One concrete usage can be: "echo 20000 >/sys/class/net/eth1/gro_flush_timeout" followed by "echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs". If at least one packet is retired, then we will reset the napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQs, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not calling napi_complete_done(). This feature can also be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes.
It also keeps the serving CPUs from idling for too long, reducing tail latencies. Co-developed-by: Luigi Rizzo <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent e6acd2b commit 6f8b12d

File tree

3 files changed

+38
-11
lines changed

3 files changed

+38
-11
lines changed

include/linux/netdevice.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ struct napi_struct {
329329

330330
unsigned long state;
331331
int weight;
332+
int defer_hard_irqs_count;
332333
unsigned long gro_bitmask;
333334
int (*poll)(struct napi_struct *, int);
334335
#ifdef CONFIG_NETPOLL
@@ -1995,6 +1996,7 @@ struct net_device {
19951996

19961997
struct bpf_prog __rcu *xdp_prog;
19971998
unsigned long gro_flush_timeout;
1999+
int napi_defer_hard_irqs;
19982000
rx_handler_func_t __rcu *rx_handler;
19992001
void __rcu *rx_handler_data;
20002002

net/core/dev.c

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
62276227

62286228
bool napi_complete_done(struct napi_struct *n, int work_done)
62296229
{
6230-
unsigned long flags, val, new;
6230+
unsigned long flags, val, new, timeout = 0;
6231+
bool ret = true;
62316232

62326233
/*
62336234
* 1) Don't let napi dequeue from the cpu poll list
@@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
62396240
NAPIF_STATE_IN_BUSY_POLL)))
62406241
return false;
62416242

6242-
if (n->gro_bitmask) {
6243-
unsigned long timeout = 0;
6244-
6245-
if (work_done)
6243+
if (work_done) {
6244+
if (n->gro_bitmask)
62466245
timeout = n->dev->gro_flush_timeout;
6247-
6246+
n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs;
6247+
}
6248+
if (n->defer_hard_irqs_count > 0) {
6249+
n->defer_hard_irqs_count--;
6250+
timeout = n->dev->gro_flush_timeout;
6251+
if (timeout)
6252+
ret = false;
6253+
}
6254+
if (n->gro_bitmask) {
62486255
/* When the NAPI instance uses a timeout and keeps postponing
62496256
* it, we need to bound somehow the time packets are kept in
62506257
* the GRO layer
62516258
*/
62526259
napi_gro_flush(n, !!timeout);
6253-
if (timeout)
6254-
hrtimer_start(&n->timer, ns_to_ktime(timeout),
6255-
HRTIMER_MODE_REL_PINNED);
62566260
}
62576261

62586262
gro_normal_list(n);
@@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
62846288
return false;
62856289
}
62866290

6287-
return true;
6291+
if (timeout)
6292+
hrtimer_start(&n->timer, ns_to_ktime(timeout),
6293+
HRTIMER_MODE_REL_PINNED);
6294+
return ret;
62886295
}
62896296
EXPORT_SYMBOL(napi_complete_done);
62906297

@@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
64646471
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
64656472
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
64666473
*/
6467-
if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6474+
if (!napi_disable_pending(napi) &&
64686475
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
64696476
__napi_schedule_irqoff(napi);
64706477

net/core/net-sysfs.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
382382
}
383383
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
384384

385+
static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
386+
{
387+
dev->napi_defer_hard_irqs = val;
388+
return 0;
389+
}
390+
391+
static ssize_t napi_defer_hard_irqs_store(struct device *dev,
392+
struct device_attribute *attr,
393+
const char *buf, size_t len)
394+
{
395+
if (!capable(CAP_NET_ADMIN))
396+
return -EPERM;
397+
398+
return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
399+
}
400+
NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
401+
385402
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
386403
const char *buf, size_t len)
387404
{
@@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
545562
&dev_attr_flags.attr,
546563
&dev_attr_tx_queue_len.attr,
547564
&dev_attr_gro_flush_timeout.attr,
565+
&dev_attr_napi_defer_hard_irqs.attr,
548566
&dev_attr_phys_port_id.attr,
549567
&dev_attr_phys_port_name.attr,
550568
&dev_attr_phys_switch_id.attr,

0 commit comments

Comments
 (0)