Skip to content

Commit 3b47d30

Browse files
edumazet authored and davem330 committed
net: gro: add a per device gro flush timer
Tuning coalescing parameters on a NIC can be really hard. Servers can handle both bulk and RPC-like traffic, with conflicting goals: bulk flows want GRO packets that are as big as possible, while RPC wants minimal latencies. To reach big GRO packets on a 10GbE NIC, one can use "ethtool -C eth0 rx-usecs 4 rx-frames 44", but this penalizes RPC sessions, with an increase of latencies of up to 50% in some cases, as NICs generally do not force an interrupt when a packet with the TCP Push flag is received. Some NICs do not have an absolute timer, only a timer rearmed for every incoming packet. This patch uses a different strategy: let the GRO stack decide what to do, based on the traffic pattern. Packets with the Push flag won't be delayed. Packets without the Push flag might be held in the GRO engine, if we keep receiving data. This new mechanism is off by default, and shall be enabled by setting /sys/class/net/ethX/gro_flush_timeout to a value in nanoseconds. To fully enable this mechanism, drivers should use napi_complete_done() instead of napi_complete(). Tested: ran 200 netperf TCP_STREAM from A to B (10GbE mlx4 link, 8 RX queues). Without this feature, we send back about 305,000 ACKs per second. The GRO aggregation ratio is low (811/305 = 2.65 segments per GRO packet). Setting a timer of 2000 nsec is enough to increase GRO packet sizes and reduce the number of ACK packets (811/19.2 = 42 segments per GRO packet). The receiver performs fewer calls to upper stacks and fewer wakeups. This also reduces CPU usage on the sender, as it receives fewer ACK packets. Note that reducing the number of wakeups increases CPU efficiency, but can decrease QPS, as applications won't have the chance to warm up CPU caches by doing a partial read of RPC requests/answers if they fit in one skb.
B:~# sar -n DEV 1 10 | grep eth0 | tail -1
Average: eth0 811269.80 305732.30 1199462.57 19705.72 0.00 0.00 0.50
B:~# echo 2000 >/sys/class/net/eth0/gro_flush_timeout
B:~# sar -n DEV 1 10 | grep eth0 | tail -1
Average: eth0 811577.30 19230.80 1199916.51 1239.80 0.00 0.00 0.50

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent be955b2 commit 3b47d30

File tree

3 files changed

+69
-14
lines changed

3 files changed

+69
-14
lines changed

include/linux/netdevice.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ struct napi_struct {
314314
struct net_device *dev;
315315
struct sk_buff *gro_list;
316316
struct sk_buff *skb;
317+
struct hrtimer timer;
317318
struct list_head dev_list;
318319
struct hlist_node napi_hash_node;
319320
unsigned int napi_id;
@@ -443,14 +444,19 @@ static inline bool napi_reschedule(struct napi_struct *napi)
443444
return false;
444445
}
445446

447+
void __napi_complete(struct napi_struct *n);
448+
void napi_complete_done(struct napi_struct *n, int work_done);
446449
/**
447450
* napi_complete - NAPI processing complete
448451
* @n: napi context
449452
*
450453
* Mark NAPI processing as complete.
454+
* Consider using napi_complete_done() instead.
451455
*/
452-
void __napi_complete(struct napi_struct *n);
453-
void napi_complete(struct napi_struct *n);
456+
static inline void napi_complete(struct napi_struct *n)
457+
{
458+
return napi_complete_done(n, 0);
459+
}
454460

455461
/**
456462
* napi_by_id - lookup a NAPI by napi_id
@@ -485,14 +491,7 @@ void napi_hash_del(struct napi_struct *napi);
485491
* Stop NAPI from being scheduled on this context.
486492
* Waits till any outstanding processing completes.
487493
*/
488-
static inline void napi_disable(struct napi_struct *n)
489-
{
490-
might_sleep();
491-
set_bit(NAPI_STATE_DISABLE, &n->state);
492-
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
493-
msleep(1);
494-
clear_bit(NAPI_STATE_DISABLE, &n->state);
495-
}
494+
void napi_disable(struct napi_struct *n);
496495

497496
/**
498497
* napi_enable - enable NAPI scheduling
@@ -1603,6 +1602,7 @@ struct net_device {
16031602

16041603
#endif
16051604

1605+
unsigned long gro_flush_timeout;
16061606
rx_handler_func_t __rcu *rx_handler;
16071607
void __rcu *rx_handler_data;
16081608

net/core/dev.c

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134
#include <linux/vmalloc.h>
135135
#include <linux/if_macvlan.h>
136136
#include <linux/errqueue.h>
137+
#include <linux/hrtimer.h>
137138

138139
#include "net-sysfs.h"
139140

@@ -4412,15 +4413,14 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
44124413
void __napi_complete(struct napi_struct *n)
44134414
{
44144415
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4415-
BUG_ON(n->gro_list);
44164416

44174417
list_del_init(&n->poll_list);
44184418
smp_mb__before_atomic();
44194419
clear_bit(NAPI_STATE_SCHED, &n->state);
44204420
}
44214421
EXPORT_SYMBOL(__napi_complete);
44224422

4423-
void napi_complete(struct napi_struct *n)
4423+
void napi_complete_done(struct napi_struct *n, int work_done)
44244424
{
44254425
unsigned long flags;
44264426

@@ -4431,8 +4431,18 @@ void napi_complete(struct napi_struct *n)
44314431
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
44324432
return;
44334433

4434-
napi_gro_flush(n, false);
4434+
if (n->gro_list) {
4435+
unsigned long timeout = 0;
44354436

4437+
if (work_done)
4438+
timeout = n->dev->gro_flush_timeout;
4439+
4440+
if (timeout)
4441+
hrtimer_start(&n->timer, ns_to_ktime(timeout),
4442+
HRTIMER_MODE_REL_PINNED);
4443+
else
4444+
napi_gro_flush(n, false);
4445+
}
44364446
if (likely(list_empty(&n->poll_list))) {
44374447
WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
44384448
} else {
@@ -4442,7 +4452,7 @@ void napi_complete(struct napi_struct *n)
44424452
local_irq_restore(flags);
44434453
}
44444454
}
4445-
EXPORT_SYMBOL(napi_complete);
4455+
EXPORT_SYMBOL(napi_complete_done);
44464456

44474457
/* must be called under rcu_read_lock(), as we dont take a reference */
44484458
struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4496,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
44964506
}
44974507
EXPORT_SYMBOL_GPL(napi_hash_del);
44984508

4509+
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4510+
{
4511+
struct napi_struct *napi;
4512+
4513+
napi = container_of(timer, struct napi_struct, timer);
4514+
if (napi->gro_list)
4515+
napi_schedule(napi);
4516+
4517+
return HRTIMER_NORESTART;
4518+
}
4519+
44994520
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
45004521
int (*poll)(struct napi_struct *, int), int weight)
45014522
{
45024523
INIT_LIST_HEAD(&napi->poll_list);
4524+
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4525+
napi->timer.function = napi_watchdog;
45034526
napi->gro_count = 0;
45044527
napi->gro_list = NULL;
45054528
napi->skb = NULL;
@@ -4518,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
45184541
}
45194542
EXPORT_SYMBOL(netif_napi_add);
45204543

4544+
void napi_disable(struct napi_struct *n)
4545+
{
4546+
might_sleep();
4547+
set_bit(NAPI_STATE_DISABLE, &n->state);
4548+
4549+
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4550+
msleep(1);
4551+
4552+
hrtimer_cancel(&n->timer);
4553+
4554+
clear_bit(NAPI_STATE_DISABLE, &n->state);
4555+
}
4556+
EXPORT_SYMBOL(napi_disable);
4557+
45214558
void netif_napi_del(struct napi_struct *napi)
45224559
{
45234560
list_del_init(&napi->dev_list);

net/core/net-sysfs.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,23 @@ static ssize_t tx_queue_len_store(struct device *dev,
325325
}
326326
NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
327327

328+
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
329+
{
330+
dev->gro_flush_timeout = val;
331+
return 0;
332+
}
333+
334+
static ssize_t gro_flush_timeout_store(struct device *dev,
335+
struct device_attribute *attr,
336+
const char *buf, size_t len)
337+
{
338+
if (!capable(CAP_NET_ADMIN))
339+
return -EPERM;
340+
341+
return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
342+
}
343+
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
344+
328345
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
329346
const char *buf, size_t len)
330347
{
@@ -422,6 +439,7 @@ static struct attribute *net_class_attrs[] = {
422439
&dev_attr_mtu.attr,
423440
&dev_attr_flags.attr,
424441
&dev_attr_tx_queue_len.attr,
442+
&dev_attr_gro_flush_timeout.attr,
425443
&dev_attr_phys_port_id.attr,
426444
NULL,
427445
};

0 commit comments

Comments
 (0)