Merge tag 'perf-fixes-27504' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

Pull perf fixes from Thomas Gleixner:
 "Perf fixes for perf_mmap() reference counting to prevent potential
  reference count leaks which are caused by:

   - VMA splits, which change the offset or size of a mapping, which
     causes perf_mmap_close() to ignore the unmap or unmap the wrong
     buffer.

   - Several internal issues of perf_mmap(), which can cause reference
     count leaks in the perf mmap, corrupt accounting or cause leaks in
     perf drivers.

  The main fix is to prevent VMA splits by implementing the
  [may_]split() callback for vm operations.

  The other issues are addressed by rearranging code, early returns on
  failure and invocation of cleanups.

  Also provide a selftest to validate the fixes.

  The reference counting should be converted to refcount_t, but that
  requires larger refactoring of the code and will be done once these
  fixes are upstream"

* tag 'perf-fixes-27504' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git:
  selftests/perf_events: Add a mmap() correctness test
  perf/core: Prevent VMA split of buffer mappings
  perf/core: Handle buffer mapping fail correctly in perf_mmap()
  perf/core: Exit early on perf_mmap() fail
  perf/core: Don't leak AUX buffer refcount on allocation failure
  perf/core: Preserve AUX buffer allocation failure result
Linus Torvalds 2025-08-06 04:41:21 +03:00
commit adf12a394c
4 changed files with 266 additions and 9 deletions
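For context, the failure mode is easy to trigger from userspace: partially unmapping a perf ring-buffer mapping forces the kernel to split the VMA, after which perf_mmap_close() no longer sees a matching offset/size and the buffer reference is leaked. The sketch below demonstrates the pattern; it mirrors the new selftest further down, but the event choice (software CPU clock) and the page arithmetic are illustrative assumptions, not taken from the patches themselves.

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.disabled	= 1,
		.exclude_kernel	= 1,
	};
	long psz = sysconf(_SC_PAGESIZE);
	int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	char *rb;

	if (fd < 0)
		return 1;

	/* One control page plus a power-of-two number of data pages. */
	rb = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (rb == MAP_FAILED)
		return 1;

	/*
	 * Punching a hole forces a VMA split. Before these fixes the split
	 * succeeded and perf_mmap_close() later saw a non-matching
	 * offset/size, leaking the buffer reference. With the may_split()
	 * callback in place, this munmap() fails with EINVAL.
	 */
	if (munmap(rb + psz, psz) == 0)
		printf("VMA split allowed - unfixed kernel\n");
	else
		printf("VMA split rejected - fix present\n");

	close(fd);
	return 0;
}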

kernel/events/core.c:

@@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
 	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
 }
 
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+	/*
+	 * Forbid splitting perf mappings to prevent refcount leaks due to
+	 * the resulting non-matching offsets and sizes. See open()/close().
+	 */
+	return -EINVAL;
+}
+
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
 	.close		= perf_mmap_close, /* non mergeable */
 	.pfn_mkwrite	= perf_mmap_pfn_mkwrite,
+	.may_split	= perf_mmap_may_split,
 };
 
 static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
@@ -7051,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 			ret = 0;
 			goto unlock;
 		}
-
-		atomic_set(&rb->aux_mmap_count, 1);
 	}
 
 	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
@@ -7115,15 +7123,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		perf_event_update_time(event);
 		perf_event_init_userpage(event);
 		perf_event_update_userpage(event);
+		ret = 0;
 	} else {
 		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
 				   event->attr.aux_watermark, flags);
-		if (!ret)
+		if (!ret) {
+			atomic_set(&rb->aux_mmap_count, 1);
 			rb->aux_mmap_locked = extra;
+		}
 	}
 
-	ret = 0;
-
 unlock:
 	if (!ret) {
 		atomic_long_add(user_extra, &user->locked_vm);
@@ -7131,6 +7140,7 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	} else if (rb) {
+		/* AUX allocation failed */
 		atomic_dec(&rb->mmap_count);
 	}
 
 aux_unlock:
@@ -7138,6 +7148,9 @@ aux_unlock:
 	mutex_unlock(aux_mutex);
 	mutex_unlock(&event->mmap_mutex);
 
+	if (ret)
+		return ret;
+
 	/*
 	 * Since pinned accounting is per vm we cannot allow fork() to copy our
 	 * vma.
@@ -7145,13 +7158,20 @@ aux_unlock:
 	 */
 	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
 	vma->vm_ops = &perf_mmap_vmops;
 
-	if (!ret)
-		ret = map_range(rb, vma);
-
 	mapped = get_mapped(event, event_mapped);
 	if (mapped)
 		mapped(event, vma->vm_mm);
 
+	/*
+	 * Try to map it into the page table. On fail, invoke
+	 * perf_mmap_close() to undo the above, as the callsite expects
+	 * full cleanup in this case and therefore does not invoke
+	 * vmops::close().
+	 */
+	ret = map_range(rb, vma);
+	if (ret)
+		perf_mmap_close(vma);
+
 	return ret;
 }
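Why a bare -EINVAL from the new callback is sufficient: every path that carves up a VMA (partial munmap(), mremap(), partial mprotect()) funnels through the mm core's split helper, which asks the mapping's vm_operations for permission before splitting. The sketch below paraphrases that check; the helper name is hypothetical and the real __split_vma() also handles allocation, locking and anon_vma details, all elided here.

/*
 * Paraphrased sketch of the mm core's split-permission check
 * (see __split_vma()); not standalone code.
 */
static int vma_split_permitted(struct vm_area_struct *vma, unsigned long addr)
{
	/* The mapping can veto the split; perf now returns -EINVAL here. */
	if (vma->vm_ops && vma->vm_ops->may_split) {
		int err = vma->vm_ops->may_split(vma, addr);

		if (err)
			return err;
	}
	return 0;	/* split may proceed */
}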

tools/testing/selftests/perf_events/.gitignore:

@@ -2,3 +2,4 @@
 sigtrap_threads
 remove_on_exec
 watermark_signal
+mmap

tools/testing/selftests/perf_events/Makefile:

@@ -2,5 +2,5 @@
 CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDFLAGS += -lpthread
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal mmap
 
 include ../lib.mk

tools/testing/selftests/perf_events/mmap.c (new file, 236 lines):

// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE

#include <dirent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <linux/perf_event.h>

#include "../kselftest_harness.h"

#define RB_SIZE		0x3000
#define AUX_SIZE	0x10000
#define AUX_OFFS	0x4000

#define HOLE_SIZE	0x1000

/* Reserve space for rb, aux with space for shrink-beyond-vma testing. */
#define REGION_SIZE	(2 * RB_SIZE + 2 * AUX_SIZE)
#define REGION_AUX_OFFS	(2 * RB_SIZE)

#define MAP_BASE	1
#define MAP_AUX		2

#define EVENT_SRC_DIR	"/sys/bus/event_source/devices"

FIXTURE(perf_mmap)
{
	int		fd;
	void		*ptr;
	void		*region;
};

FIXTURE_VARIANT(perf_mmap)
{
	bool		aux;
	unsigned long	ptr_size;
};

FIXTURE_VARIANT_ADD(perf_mmap, rb)
{
	.aux		= false,
	.ptr_size	= RB_SIZE,
};

FIXTURE_VARIANT_ADD(perf_mmap, aux)
{
	.aux		= true,
	.ptr_size	= AUX_SIZE,
};

static bool read_event_type(struct dirent *dent, __u32 *type)
{
	char typefn[512];
	FILE *fp;
	int res;

	snprintf(typefn, sizeof(typefn), "%s/%s/type", EVENT_SRC_DIR, dent->d_name);
	fp = fopen(typefn, "r");
	if (!fp)
		return false;

	res = fscanf(fp, "%u", type);
	fclose(fp);
	return res > 0;
}

FIXTURE_SETUP(perf_mmap)
{
	struct perf_event_attr attr = {
		.size		= sizeof(attr),
		.disabled	= 1,
		.exclude_kernel	= 1,
		.exclude_hv	= 1,
	};
	struct perf_event_attr attr_ok = {};
	unsigned int eacces = 0, map = 0;
	struct perf_event_mmap_page *rb;
	struct dirent *dent;
	void *aux, *region;
	DIR *dir;

	self->ptr = NULL;

	dir = opendir(EVENT_SRC_DIR);
	if (!dir)
		SKIP(return, "perf not available.");

	region = mmap(NULL, REGION_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
	ASSERT_NE(region, MAP_FAILED);
	self->region = region;

	// Try to find a suitable event on this system
	while ((dent = readdir(dir))) {
		int fd;

		if (!read_event_type(dent, &attr.type))
			continue;

		fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			if (errno == EACCES)
				eacces++;
			continue;
		}

		// Check whether the event supports mmap()
		rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
		if (rb == MAP_FAILED) {
			close(fd);
			continue;
		}

		if (!map) {
			// Save the event in case that no AUX capable event is found
			attr_ok = attr;
			map = MAP_BASE;
		}

		if (!variant->aux)
			continue;

		rb->aux_offset = AUX_OFFS;
		rb->aux_size = AUX_SIZE;

		// Check whether it supports a AUX buffer
		aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_FIXED, fd, AUX_OFFS);
		if (aux == MAP_FAILED) {
			munmap(rb, RB_SIZE);
			close(fd);
			continue;
		}

		attr_ok = attr;
		map = MAP_AUX;
		munmap(aux, AUX_SIZE);
		munmap(rb, RB_SIZE);
		close(fd);
		break;
	}
	closedir(dir);

	if (!map) {
		if (!eacces)
			SKIP(return, "No mappable perf event found.");
		else
			SKIP(return, "No permissions for perf_event_open()");
	}

	self->fd = syscall(SYS_perf_event_open, &attr_ok, 0, -1, -1, 0);
	ASSERT_NE(self->fd, -1);

	rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, self->fd, 0);
	ASSERT_NE(rb, MAP_FAILED);

	if (!variant->aux) {
		self->ptr = rb;
		return;
	}

	if (map != MAP_AUX)
		SKIP(return, "No AUX event found.");

	rb->aux_offset = AUX_OFFS;
	rb->aux_size = AUX_SIZE;
	aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_FIXED, self->fd, AUX_OFFS);
	ASSERT_NE(aux, MAP_FAILED);
	self->ptr = aux;
}

FIXTURE_TEARDOWN(perf_mmap)
{
	ASSERT_EQ(munmap(self->region, REGION_SIZE), 0);
	if (self->fd != -1)
		ASSERT_EQ(close(self->fd), 0);
}

TEST_F(perf_mmap, remap)
{
	void *tmp, *ptr = self->ptr;
	unsigned long size = variant->ptr_size;

	// Test the invalid remaps
	ASSERT_EQ(mremap(ptr, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
	ASSERT_EQ(mremap(ptr + size - HOLE_SIZE, HOLE_SIZE, size, MREMAP_MAYMOVE), MAP_FAILED);
	// Shrink the end of the mapping such that we only unmap past end of the VMA,
	// which should succeed and poke a hole into the PROT_NONE region
	ASSERT_NE(mremap(ptr + size - HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);

	// Remap the whole buffer to a new address
	tmp = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(tmp, MAP_FAILED);

	// Try splitting offset 1 hole size into VMA, this should fail
	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size - HOLE_SIZE, size - HOLE_SIZE,
			 MREMAP_MAYMOVE | MREMAP_FIXED, tmp), MAP_FAILED);
	// Remapping the whole thing should succeed fine
	ptr = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tmp);
	ASSERT_EQ(ptr, tmp);
	ASSERT_EQ(munmap(tmp, size), 0);
}

TEST_F(perf_mmap, unmap)
{
	unsigned long size = variant->ptr_size;

	// Try to poke holes into the mappings
	ASSERT_NE(munmap(self->ptr, HOLE_SIZE), 0);
	ASSERT_NE(munmap(self->ptr + HOLE_SIZE, HOLE_SIZE), 0);
	ASSERT_NE(munmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE), 0);
}

TEST_F(perf_mmap, map)
{
	unsigned long size = variant->ptr_size;

	// Try to poke holes into the mappings by mapping anonymous memory over it
	ASSERT_EQ(mmap(self->ptr, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
	ASSERT_EQ(mmap(self->ptr + HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
	ASSERT_EQ(mmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
}

TEST_HARNESS_MAIN
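To exercise the new test, the standard kselftest invocation should work from a kernel tree with these fixes applied:

	make -C tools/testing/selftests TARGETS=perf_events run_tests

Note that the fixture skips rather than fails when no mappable perf event is found or when perf_event_open() is not permitted, so an unprivileged run on a locked-down system reports SKIP instead of a failure.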