mm/vma: move stack expansion logic to mm/vma.c

We build on previous work making expand_downwards() an entirely internal
function.

This logic is subtle, so it is highly useful to get it into vma.c so that
we can then unit test it from userland (an illustrative test sketch
follows the tags below).

We must additionally move acct_stack_growth() to vma.c as it is a helper
function used by both expand_downwards() and expand_upwards().

We are also then able to mark anon_vma_interval_tree_pre_update_vma() and
anon_vma_interval_tree_post_update_vma() static as these are no longer
used by anything else.

Link: https://lkml.kernel.org/r/0feb104eff85922019d4fb29280f3afb130c5204.1733248985.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit a9d1f3f2d7 (parent 7a57149918)
Lorenzo Stoakes, 2024-12-03 18:05:11 +00:00; committed by Andrew Morton
5 changed files with 310 additions and 243 deletions
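
As an aside, here is a minimal sketch of the kind of userland unit test
this move enables, written against the tools/testing/vma harness.
alloc_vma() and the ASSERT_*() macros follow that harness's conventions,
but attach_vma() (assumed to link the VMA into the mm's maple tree), the
exact signatures, and the test itself are illustrative assumptions, not
part of this patch:

/*
 * Illustrative only: grow a one-page VM_GROWSDOWN stack VMA down by a
 * page via expand_downwards(), which this patch makes reachable from
 * the userland vma tests. attach_vma() is an assumed helper that
 * inserts the VMA into the mm's maple tree.
 */
static bool test_expand_downwards(void)
{
	struct mm_struct mm = {};
	struct vm_area_struct *vma;

	/* One-page stack VMA spanning [0x201000, 0x202000), pgoff 1. */
	vma = alloc_vma(&mm, 0x201000, 0x202000, 1,
			VM_READ | VM_WRITE | VM_GROWSDOWN);
	ASSERT_NE(vma, NULL);
	ASSERT_EQ(attach_vma(&mm, vma), 0);

	/* Grow the stack down by one page; zero indicates success. */
	ASSERT_EQ(expand_downwards(vma, 0x200000), 0);

	/* vm_start moves down; vm_pgoff retreats by the pages grown. */
	ASSERT_EQ(vma->vm_start, 0x200000ul);
	ASSERT_EQ(vma->vm_pgoff, 0ul);

	return true;
}

Such a test would be registered alongside the existing ones in the
harness's main(), assuming its usual runner convention.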

mm/mmap.c | 205

@@ -879,211 +879,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
return vma;
}
/*
* Verify that the stack growth is acceptable and
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
/* mlock limit tests */
if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
return 0;
}
#if defined(CONFIG_STACK_GROWSUP)
/*
* PA-RISC uses this for its stack.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
mmap_assert_write_locked(mm);
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
return -ENOMEM;
address += PAGE_SIZE;
/* Enforce stack_guard_gap */
gap_addr = address + stack_guard_gap;
/* Guard against overflow */
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
next = find_vma_intersection(mm, vma->vm_end, gap_addr);
if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
if (next)
vma_iter_prev_range_limit(&vmi, address);
vma_iter_config(&vmi, vma->vm_start, address);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
vma_iter_free(&vmi);
return -ENOMEM;
}
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
/* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
unsigned long size, grow;
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
error = -ENOMEM;
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP */
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
* mmap_lock held for writing.
*/
static int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
int error = 0;
VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
mmap_assert_write_locked(mm);
address &= PAGE_MASK;
if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
/* Enforce stack_guard_gap */
prev = vma_prev(&vmi);
/* Check that both stack segments have the same anon_vma? */
if (prev) {
if (!(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev) &&
(address - prev->vm_end < stack_guard_gap))
return -ENOMEM;
}
if (prev)
vma_iter_next_range_limit(&vmi, vma->vm_start);
vma_iter_config(&vmi, address, vma->vm_end);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
vma_iter_free(&vmi);
return -ENOMEM;
}
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
/* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
unsigned long size, grow;
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
error = -ENOMEM;
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
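/* With 4 KiB pages, 256UL << PAGE_SHIFT works out to a 1 MiB guard gap. */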

mm/vma.c | 269

@@ -202,6 +202,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
*
* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
* vma must be removed from the anon_vma's interval trees using
* anon_vma_interval_tree_pre_update_vma().
*
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
* The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
static void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}
static void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
/*
* vma_prepare() - Helper function for handling locking VMAs prior to altering
* @vp: The initialized vma_prepare struct
@@ -510,38 +542,6 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return __split_vma(vmi, vma, addr, new_below);
}
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
*
* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
* vma must be removed from the anon_vma's interval trees using
* anon_vma_interval_tree_pre_update_vma().
*
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
* The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}
void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
/*
* dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
@@ -2672,3 +2672,208 @@ retry:
return gap;
}
/*
* Verify that the stack growth is acceptable and
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
/* mlock limit tests */
if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
return 0;
}
#if defined(CONFIG_STACK_GROWSUP)
/*
* PA-RISC uses this for its stack.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
mmap_assert_write_locked(mm);
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
return -ENOMEM;
address += PAGE_SIZE;
/* Enforce stack_guard_gap */
gap_addr = address + stack_guard_gap;
/* Guard against overflow */
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
next = find_vma_intersection(mm, vma->vm_end, gap_addr);
if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
if (next)
vma_iter_prev_range_limit(&vmi, address);
vma_iter_config(&vmi, vma->vm_start, address);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
vma_iter_free(&vmi);
return -ENOMEM;
}
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
/* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
unsigned long size, grow;
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
error = -ENOMEM;
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP */
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
* mmap_lock held for writing.
*/
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
int error = 0;
VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
mmap_assert_write_locked(mm);
address &= PAGE_MASK;
if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
/* Enforce stack_guard_gap */
prev = vma_prev(&vmi);
/* Check that both stack segments have the same anon_vma? */
if (prev) {
if (!(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev) &&
(address - prev->vm_end < stack_guard_gap))
return -ENOMEM;
}
if (prev)
vma_iter_next_range_limit(&vmi, vma->vm_start);
vma_iter_config(&vmi, address, vma->vm_end);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
vma_iter_free(&vmi);
return -ENOMEM;
}
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
/* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
unsigned long size, grow;
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
error = -ENOMEM;
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
vma_iter_free(&vmi);
validate_mm(mm);
return error;
}

mm/vma.h

@@ -139,12 +139,6 @@ void validate_mm(struct mm_struct *mm);
#define validate_mm(mm) do { } while (0)
#endif
/* Required for expand_downwards(). */
void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
/* Required for expand_downwards(). */
void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
int vma_expand(struct vma_merge_struct *vmg);
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long start, unsigned long end, pgoff_t pgoff);
@@ -478,4 +472,10 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
#endif
#if defined(CONFIG_STACK_GROWSUP)
int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#endif
int expand_downwards(struct vm_area_struct *vma, unsigned long address);
#endif /* __MM_VMA_H */

tools/testing/vma/vma.c

@@ -53,6 +53,11 @@ struct task_struct *get_current(void)
return &__current;
}
/* Stub: the userland tests behave as if all resource limits are unlimited. */
unsigned long rlimit(unsigned int limit)
{
	return (unsigned long)-1;
}
/* Helper function to simply allocate a VMA. */
static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
unsigned long start,

tools/testing/vma/vma_internal.h

@@ -79,6 +79,11 @@ extern unsigned long dac_mmap_min_addr;
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
#define RLIMIT_STACK 3 /* max stack size */
#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */
#define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */
#define VM_SEALED _BITUL(63)
@@ -478,6 +483,8 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
extern const struct vm_operations_struct vma_dummy_vm_ops;
extern unsigned long rlimit(unsigned int limit);
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
memset(vma, 0, sizeof(*vma));
@@ -1114,4 +1121,59 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
return vm_end;
}
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr, unsigned long len)
{
return 0;
}
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_ACCESS_FLAGS;
}
/* Stub: grant all capabilities in the userland tests. */
static inline bool capable(int cap)
{
	return true;
}
static inline bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
locked_pages += mm->locked_vm;
limit_pages = rlimit(RLIMIT_MEMLOCK);
limit_pages >>= PAGE_SHIFT;
return locked_pages <= limit_pages;
}
static inline int __anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
if (!anon_vma)
return -ENOMEM;
anon_vma->root = anon_vma;
vma->anon_vma = anon_vma;
return 0;
}
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
if (likely(vma->anon_vma))
return 0;
return __anon_vma_prepare(vma);
}
#endif /* __MM_VMA_INTERNAL_H */