
Commit 3a54654

Frederic Weisbecker authored and Peter Zijlstra committed
perf: Fix event leak upon exec and file release
The perf pending task work is never waited upon when the matching event is
released. In the case of a child event, released via free_event() directly,
this can potentially result in a leaked event, such as in the following
scenario that doesn't even require a weak IRQ work implementation to trigger:

    schedule()
       prepare_task_switch()
    =======> <NMI>
          perf_event_overflow()
             event->pending_sigtrap = ...
             irq_work_queue(&event->pending_irq)
    <======= </NMI>
          perf_event_task_sched_out()
              event_sched_out()
                  event->pending_sigtrap = 0;
                  atomic_long_inc_not_zero(&event->refcount)
                  task_work_add(&event->pending_task)
       finish_lock_switch()
    =======> <IRQ>
          perf_pending_irq()
             // do nothing, rely on pending task work
    <======= </IRQ>

    begin_new_exec()
       perf_event_exit_task()
          perf_event_exit_event()
             // If is child event
             free_event()
                WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
                // event is leaked

Similar scenarios can also happen with perf_event_remove_on_exec() or simply
against a concurrent perf_event_release().

Fix this by synchronizing against the possibly remaining pending task work
while freeing the event, just like is done with remaining pending IRQ work.
This means that the pending task callback neither needs nor should hold a
reference to the event, preventing it from ever being freed.

Fixes: 517e6a3 ("perf: Fix perf_pending_task() UaF")
Signed-off-by: Frederic Weisbecker <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
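For context on the mechanism at the heart of the race: task work is queued on a specific task with task_work_add() and only runs from that task's own path (return to user space, task exit), so nothing flushes it automatically when the object it points to is torn down. The sketch below is a minimal, hypothetical kernel-module-style illustration of that lifecycle (struct my_obj, my_task_func(), my_queue_work() and my_release() are made-up names, not perf code); the commit above adds the missing "cancel or wait before free" step for perf events.

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

/* Hypothetical object carrying a pending callback, similar in shape to
 * perf_event::pending_task. */
struct my_obj {
	struct callback_head	work;
	int			payload;
};

/* Runs later, from the owning task's task-work run point (return to
 * user space with TWA_RESUME, or task exit). */
static void my_task_func(struct callback_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, work);

	pr_info("pending work ran, payload=%d\n", obj->payload);
}

static int my_queue_work(struct my_obj *obj)
{
	init_task_work(&obj->work, my_task_func);
	/* Queue on the current task; fails if the task is already exiting. */
	return task_work_add(current, &obj->work, TWA_RESUME);
}

static void my_release(struct my_obj *obj)
{
	/*
	 * Freeing obj while the callback is still queued would leave a
	 * dangling callback_head. If the work sits on the current task's
	 * queue it can simply be cancelled; otherwise the release path has
	 * to wait for the callback to finish (the commit does this with an
	 * rcuwait) before freeing.
	 */
	if (task_work_cancel(current, &obj->work))
		kfree(obj);
}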
1 parent 2fd5ad3 commit 3a54654

2 files changed: 35 additions, 4 deletions

include/linux/perf_event.h
Lines changed: 1 addition & 0 deletions

@@ -786,6 +786,7 @@ struct perf_event {
 	struct irq_work			pending_irq;
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
+	struct rcuwait			pending_work_wait;
 
 	atomic_t			event_limit;
 

kernel/events/core.c
Lines changed: 34 additions & 4 deletions

@@ -2288,7 +2288,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
 	if (state != PERF_EVENT_STATE_OFF &&
 	    !event->pending_work &&
 	    !task_work_add(current, &event->pending_task, TWA_RESUME)) {
-		WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
 		event->pending_work = 1;
 	} else {
 		local_dec(&event->ctx->nr_pending);
@@ -5203,9 +5202,35 @@ static bool exclusive_event_installable(struct perf_event *event,
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
 
+static void perf_pending_task_sync(struct perf_event *event)
+{
+	struct callback_head *head = &event->pending_task;
+
+	if (!event->pending_work)
+		return;
+	/*
+	 * If the task is queued to the current task's queue, we
+	 * obviously can't wait for it to complete. Simply cancel it.
+	 */
+	if (task_work_cancel(current, head)) {
+		event->pending_work = 0;
+		local_dec(&event->ctx->nr_pending);
+		return;
+	}
+
+	/*
+	 * All accesses related to the event are within the same
+	 * non-preemptible section in perf_pending_task(). The RCU
+	 * grace period before the event is freed will make sure all
+	 * those accesses are complete by then.
+	 */
+	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
+	perf_pending_task_sync(event);
 
 	unaccount_event(event);
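The new perf_pending_task_sync() builds on the rcuwait primitive: one side sleeps in rcuwait_wait_event() until a condition becomes true, the other clears the condition and calls rcuwait_wake_up(). Below is a minimal sketch of that pairing under made-up names (struct my_ctx, my_ctx_init(), my_worker_done(), my_wait_for_worker(); not perf code); it mirrors the shape of the patch, where the waker's update runs inside one non-preemptible section so a later RCU grace period can bound it.

#include <linux/preempt.h>
#include <linux/rcuwait.h>
#include <linux/sched.h>

struct my_ctx {
	unsigned int	pending;	/* condition being waited on */
	struct rcuwait	wait;
};

static void my_ctx_init(struct my_ctx *ctx)
{
	ctx->pending = 1;
	rcuwait_init(&ctx->wait);
}

/* Waker side: clear the condition, then wake a possible rcuwait waiter.
 * Kept inside a single non-preemptible section, like perf_pending_task(). */
static void my_worker_done(struct my_ctx *ctx)
{
	preempt_disable();
	ctx->pending = 0;
	rcuwait_wake_up(&ctx->wait);
	preempt_enable();
}

/* Waiter side: sleep until the condition is observed true, like the
 * rcuwait_wait_event() call in perf_pending_task_sync(). */
static void my_wait_for_worker(struct my_ctx *ctx)
{
	rcuwait_wait_event(&ctx->wait, !ctx->pending, TASK_UNINTERRUPTIBLE);
}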

@@ -6817,24 +6842,28 @@ static void perf_pending_task(struct callback_head *head)
 	struct perf_event *event = container_of(head, struct perf_event, pending_task);
 	int rctx;
 
+	/*
+	 * All accesses to the event must belong to the same implicit RCU read-side
+	 * critical section as the ->pending_work reset. See comment in
+	 * perf_pending_task_sync().
+	 */
+	preempt_disable_notrace();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
 	 */
-	preempt_disable_notrace();
 	rctx = perf_swevent_get_recursion_context();
 
 	if (event->pending_work) {
 		event->pending_work = 0;
 		perf_sigtrap(event);
 		local_dec(&event->ctx->nr_pending);
+		rcuwait_wake_up(&event->pending_work_wait);
 	}
 
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 	preempt_enable_notrace();
-
-	put_event(event);
 }
 
 #ifdef CONFIG_GUEST_PERF_EVENTS
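The comment added to perf_pending_task() relies on the fact that, with the kernel's consolidated RCU implementation, a region with preemption disabled counts as a read-side critical section that an RCU grace period waits for. A generic, hypothetical sketch of that pattern follows (my_data, my_ptr, my_reader() and my_free() are made-up names; perf itself frees events after an RCU grace period rather than calling synchronize_rcu() at this point):

#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
	int value;
};

static struct my_data __rcu *my_ptr;

/* Reader: all accesses happen inside one non-preemptible section, which
 * current RCU treats as a read-side critical section. */
static void my_reader(void)
{
	struct my_data *p;

	preempt_disable();
	p = rcu_dereference_sched(my_ptr);
	if (p)
		pr_info("value=%d\n", p->value);
	preempt_enable();
}

/* Freer: unpublish the pointer, wait for a grace period so that every
 * such non-preemptible section has finished, then free. */
static void my_free(void)
{
	struct my_data *p = rcu_dereference_protected(my_ptr, true);

	if (!p)
		return;
	rcu_assign_pointer(my_ptr, NULL);
	synchronize_rcu();
	kfree(p);
}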
@@ -11948,6 +11977,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending_irq, perf_pending_irq);
 	init_task_work(&event->pending_task, perf_pending_task);
+	rcuwait_init(&event->pending_work_wait);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
