summaryrefslogtreecommitdiff
path: root/mm/mmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--mm/mmap.c2259
1 files changed, 1135 insertions, 1124 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d780f415be3..6e447544f07d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -14,7 +14,6 @@
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -39,7 +38,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -132,12 +131,10 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
/*
- * Close a vm structure and free it, returning the next.
+ * Close a vm structure and free it.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
@@ -145,20 +142,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
- return next;
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -200,35 +218,51 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ BUG_ON(brkvma == NULL);
+ if (brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
success:
@@ -247,104 +281,45 @@ out:
return origbrk;
}
-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
-
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
-
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+extern void mt_validate(struct maple_tree *mt);
+extern void mt_dump(const struct maple_tree *mt);
+
+/* Validate the maple tree */
+static void validate_mm_mt(struct mm_struct *mm)
+{
+ struct maple_tree *mt = &mm->mm_mt;
+ struct vm_area_struct *vma_mt;
+
+ MA_STATE(mas, mt, 0, 0);
+
+ mt_validate(&mm->mm_mt);
+ mas_for_each(&mas, vma_mt, ULONG_MAX) {
+ if ((vma_mt->vm_start != mas.index) ||
+ (vma_mt->vm_end - 1 != mas.last)) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma_mt);
+ pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
+ mas.index, mas.last);
+ pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
+ vma_mt->vm_start, vma_mt->vm_end);
+
+ mt_dump(mas.tree);
+ if (vma_mt->vm_end != mas.last + 1) {
+ pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
+ mm, vma_mt->vm_start, vma_mt->vm_end,
+ mas.index, mas.last);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
+ if (vma_mt->vm_start != mas.index) {
+ pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
+ mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
}
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
-
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
}
}
@@ -352,10 +327,13 @@ static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- unsigned long highest_address = 0;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
- while (vma) {
+ validate_mm_mt(mm);
+
+ mas_for_each(&mas, vma, ULONG_MAX) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
@@ -365,93 +343,20 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
- highest_address = vm_end_gap(vma);
- vma = vma->vm_next;
+#endif
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
- bug = 1;
- }
- if (highest_address != mm->highest_vm_end) {
- pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
- bug = 1;
- }
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
-#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
-
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
+#define validate_mm_mt(root) do { } while (0)
+#define validate_mm(mm) do { } while (0)
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -485,208 +390,220 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+ VMA_ITERATOR(vmi, mm, addr);
+ struct vm_area_struct *vma;
+ unsigned long nr_pages = 0;
- mmap_assert_locked(mm);
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vm_start = max(addr, vma->vm_start);
+ unsigned long vm_end = min(end, vma->vm_end);
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
+ nr_pages += PHYS_PFN(vm_end - vm_start);
+ }
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+ return nr_pages;
+}
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
}
/*
- * vma_next() - Get the next VMA.
- * @mm: The mm_struct.
- * @vma: The current vma.
+ * vma_mas_store() - Store a VMA in the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * If @vma is NULL, return the first vma in the mm.
+ * Efficient way to store a VMA in the maple tree when the @mas has already
+ * walked to the correct location.
*
- * Returns: The next VMA after @vma.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
- struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- if (!vma)
- return mm->mmap;
-
- return vma->vm_next;
+ trace_vma_store(mas->tree, vma);
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
}
/*
- * munmap_vma_range() - munmap VMAs that overlap a range.
- * @mm: The mm struct
- * @start: The start of the range.
- * @len: The length of the range.
- * @pprev: pointer to the pointer that will be set to previous vm_area_struct
- * @rb_link: the rb_node
- * @rb_parent: the parent rb_node
- *
- * Find all the vm_area_struct that overlap from @start to
- * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ * vma_mas_remove() - Remove a VMA from the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * Returns: -ENOMEM on munmap failure or 0 on success.
+ * Efficient way to remove a VMA from the maple tree when the @mas has already
+ * been established and points to the correct location.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline int
-munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct rb_node ***link,
- struct rb_node **parent, struct list_head *uf)
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
{
-
- while (find_vma_links(mm, start, start + len, pprev, link, parent))
- if (do_munmap(mm, start, len, uf))
- return -ENOMEM;
-
- return 0;
+ trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1);
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+
+/*
+ * vma_mas_szero() - Set a given range to zero. Used when modifying a
+ * vm_area_struct start or end.
+ *
+ * @mm: The struct_mm
+ * @start: The start address to zero
+ * @end: The end address to zero.
+ */
+static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
+ unsigned long end)
{
- unsigned long nr_pages = 0;
- struct vm_area_struct *vma;
+ trace_vma_mas_szero(mas->tree, start, end - 1);
+ mas_set_range(mas, start, end - 1);
+ mas_store_prealloc(mas, NULL);
+}
- /* Find first overlapping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct address_space *mapping = NULL;
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_start > end)
- break;
+ vma_mas_store(vma, &mas);
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
+ if (mapping) {
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
}
- return nr_pages;
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
}
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @mas: The maple state
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The current of next vma.
+ *
+ * Expand @vma to @start and @end. Can expand off the start and end. Will
+ * expand over @next if it's different from @vma and @end == @next->vm_end.
+ * Checking if the @vma can expand and merge with @next needs to be handled by
+ * the caller.
+ *
+ * Returns: 0 on success
+ */
+inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct file *file = vma->vm_file;
+ bool remove_next = false;
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
+ if (next && (vma != next) && (end == next->vm_end)) {
+ remove_next = true;
+ if (next->anon_vma && !vma->anon_vma) {
+ int error;
-static void __vma_link_file(struct vm_area_struct *vma)
-{
- struct file *file;
+ anon_vma = next->anon_vma;
+ vma->anon_vma = anon_vma;
+ error = anon_vma_clone(vma, next);
+ if (error)
+ return error;
+ }
+ }
+
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto nomem;
+
+ vma_adjust_trans_huge(vma, start, end, 0);
- file = vma->vm_file;
if (file) {
- struct address_space *mapping = file->f_mapping;
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
+ if (anon_vma) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ if (file) {
flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
+ vma_interval_tree_remove(vma, root);
}
-}
-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
-}
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: mas must be pointing to the expanding VMA */
+ vma_mas_store(vma, mas);
-static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- struct address_space *mapping = NULL;
+ if (file) {
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
+ /* Expanding over the next vma */
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
}
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- __vma_link_file(vma);
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(anon_vma);
+ }
- if (mapping)
+ if (file) {
i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+ }
- mm->map_count++;
- validate_mm(mm);
-}
-
-/*
- * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
- */
-static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ }
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
-}
+ validate_mm(mm);
+ return 0;
-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
+nomem:
+ return -ENOMEM;
}
/*
@@ -701,18 +618,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
+ bool vma_changed = false;
long adjust_next = 0;
int remove_next = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -741,10 +659,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
+ if (remove_next == 2)
+ next_next = find_vma(mm, next->vm_end);
+
VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
- /* trim end to next, for case 6 first pass */
- end = next->vm_end;
+ end != next_next->vm_end);
}
exporter = next;
@@ -755,7 +674,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
- exporter = next->vm_next;
+ exporter = next_next;
} else if (end > next->vm_start) {
/*
@@ -792,9 +711,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
return error;
}
}
-again:
- vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
@@ -804,14 +725,14 @@ again:
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
- if (insert) {
+ if (insert && insert->vm_file) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
- __vma_link_file(insert);
+ __vma_link_file(insert, insert->vm_file->f_mapping);
}
}
@@ -835,17 +756,37 @@ again:
}
if (start != vma->vm_start) {
+ if ((vma->vm_start < start) &&
+ (!insert || (insert->vm_end != start))) {
+ vma_mas_szero(&mas, vma->vm_start, start);
+ VM_WARN_ON(insert && insert->vm_start > vma->vm_start);
+ } else {
+ vma_changed = true;
+ }
vma->vm_start = start;
- start_changed = true;
}
if (end != vma->vm_end) {
+ if (vma->vm_end > end) {
+ if (!insert || (insert->vm_start != end)) {
+ vma_mas_szero(&mas, end, vma->vm_end);
+ mas_reset(&mas);
+ VM_WARN_ON(insert &&
+ insert->vm_end < vma->vm_end);
+ }
+ } else {
+ vma_changed = true;
+ }
vma->vm_end = end;
- end_changed = true;
}
+
+ if (vma_changed)
+ vma_mas_store(vma, &mas);
+
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next;
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ vma_mas_store(next, &mas);
}
if (file) {
@@ -855,42 +796,19 @@ again:
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next == 2)
+ __remove_shared_vm_struct(next_next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- __insert_vm_struct(mm, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
+ mas_reset(&mas);
+ vma_mas_store(insert, &mas);
+ mm->map_count++;
}
if (anon_vma) {
@@ -909,6 +827,7 @@ again:
}
if (remove_next) {
+again:
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
@@ -917,66 +836,24 @@ again:
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
+ if (remove_next != 2)
+ BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
- * we must remove another next too. It would clutter
- * up the code too much to do both in one go.
+ * we must remove next_next too.
*/
- if (remove_next != 3) {
- /*
- * If "next" was removed and vma->vm_end was
- * expanded (up) over it, in turn
- * "next->vm_prev->vm_end" changed and the
- * "vma->vm_next" gap must be updated.
- */
- next = vma->vm_next;
- } else {
- /*
- * For the scope of the comment "next" and
- * "vma" considered pre-swap(): if "vma" was
- * removed, next->vm_start was expanded (down)
- * over it and the "next" gap must be updated.
- * Because of the swap() the post-swap() "vma"
- * actually points to pre-swap() "next"
- * (post-swap() "next" as opposed is now a
- * dangling pointer).
- */
- next = vma;
- }
if (remove_next == 2) {
remove_next = 1;
- end = next->vm_end;
+ next = next_next;
goto again;
}
- else if (next)
- vma_gap_update(next);
- else {
- /*
- * If remove_next == 2 we obviously can't
- * reach this path.
- *
- * If remove_next == 3 we can't reach this
- * path because pre-swap() next is always not
- * NULL. pre-swap() "next" is not being
- * removed and its next->vm_end is not altered
- * (and furthermore "end" already matches
- * next->vm_end in remove_next == 3).
- *
- * We reach this only in the remove_next == 1
- * case if the "next" vma that was removed was
- * the highest vma of the mm. However in such
- * case next->vm_end == "end" and the extended
- * "vma" has vma->vm_end == next->vm_end so
- * mm->highest_vm_end doesn't need any update
- * in remove_next == 1 case.
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
}
if (insert && file)
uprobe_mmap(insert);
+ mas_destroy(&mas);
validate_mm(mm);
return 0;
@@ -1128,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *area, *next;
- int err;
+ struct vm_area_struct *mid, *next, *res;
+ int err = -1;
+ bool merge_prev = false;
+ bool merge_next = false;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -1138,76 +1017,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- next = vma_next(mm, prev);
- area = next;
- if (area && area->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = find_vma(mm, prev ? prev->vm_end : 0);
+ mid = next;
+ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = find_vma(mm, next->vm_end);
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(mid && end > mid->vm_end);
VM_WARN_ON(addr >= end);
- /*
- * Can it merge with the predecessor?
- */
+ /* Can we merge the predecessor? */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
- /*
- * OK, it can. Can we now merge in the successor as well?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file,
- pgoff+pglen,
- vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) {
- /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- } else /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- if (err)
- return NULL;
- khugepaged_enter_vma(prev, vm_flags);
- return prev;
+ merge_prev = true;
}
-
- /*
- * Can this new request be merged in front of next?
- */
+ /* Can we merge the successor? */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
+ merge_next = true;
+ }
+ /* Can we merge both the predecessor and the successor? */
+ if (merge_prev && merge_next &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL)) { /* cases 1, 6 */
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
+ res = prev;
+ } else if (merge_prev) { /* cases 2, 5, 7 */
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
+ res = prev;
+ } else if (merge_next) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else { /* cases 3, 8 */
- err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
- /*
- * In case 3 area is already equal to next and
- * this is a noop, but in case 8 "area" has
- * been removed and next was expanded over it.
- */
- area = next;
- }
- if (err)
- return NULL;
- khugepaged_enter_vma(area, vm_flags);
- return area;
+ addr, prev->vm_pgoff, NULL, next);
+ else /* cases 3, 8 */
+ err = __vma_adjust(mid, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ res = next;
}
- return NULL;
+ /*
+ * Cannot merge with predecessor or successor or error in __vma_adjust?
+ */
+ if (err)
+ return NULL;
+ khugepaged_enter_vma(res, vm_flags);
+ return res;
}
/*
@@ -1275,18 +1139,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
/* Try next first. */
- if (vma->vm_next) {
- anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ next = mas_walk(&mas);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
+ prev = mas_prev(&mas, 0);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = mas_prev(&mas, 0);
/* Try prev next. */
- if (vma->vm_prev)
- anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
@@ -1375,6 +1245,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
+ validate_mm(mm);
*populate = 0;
if (!len)
@@ -1678,388 +1549,63 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
-
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
- vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
-
- /* If vm_flags changed after call_mmap(), we should try merge vma again
- * as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (merge) {
- /* ->mmap() can change vma->vm_file and fput the original file. So
- * fput the vma->vm_file here or we would add an extra fput for file
- * and cause general protection fault ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto unmap_writable;
- }
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- if (file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
-
- vma_link(mm, vma, prev, rb_link, rb_parent);
-
- /*
- * vma_merge() calls khugepaged_enter_vma() either, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
- /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
- if (file && vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
- file = vma->vm_file;
-out:
- perf_event_mmap(vma);
-
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
-}
-
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
- /*
- * We implement the search by looking for an rbtree node that
- * immediately follows a suitable gap. That is,
- * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
- * - gap_end = vma->vm_start >= info->low_limit + length;
- * - gap_end - gap_start >= length
- */
+ unsigned long length, gap;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /* Adjust search limits by the desired length */
- if (info->high_limit < length)
- return -ENOMEM;
- high_limit = info->high_limit - length;
-
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- goto check_highest;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- goto check_highest;
-
- while (true) {
- /* Visit left subtree if it looks promising */
- gap_end = vm_start_gap(vma);
- if (gap_end >= low_limit && vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
-check_current:
- /* Check if current node has a suitable gap */
- if (gap_start > high_limit)
- return -ENOMEM;
- if (gap_end >= low_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit right subtree if it looks promising */
- if (vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- goto check_highest;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_left) {
- gap_start = vm_end_gap(vma->vm_prev);
- gap_end = vm_start_gap(vma);
- goto check_current;
- }
- }
- }
-
-check_highest:
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
- if (gap_start > high_limit)
+ if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
-found:
- /* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
- gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
-
- VM_BUG_ON(gap_start + info->length > info->high_limit);
- VM_BUG_ON(gap_start + info->length > gap_end);
- return gap_start;
+ gap = mas.index;
+ gap += (info->align_offset - gap) & info->align_mask;
+ return gap;
}
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with * the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long length, gap;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /*
- * Adjust search limits by the desired length.
- * See implementation comment at top of unmapped_area().
- */
- gap_end = info->high_limit;
- if (gap_end < length)
- return -ENOMEM;
- high_limit = gap_end - length;
-
- if (info->low_limit > high_limit)
+ if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- if (gap_start <= high_limit)
- goto found_highest;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- return -ENOMEM;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- return -ENOMEM;
-
- while (true) {
- /* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
- if (gap_start <= high_limit && vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
-check_current:
- /* Check if current node has a suitable gap */
- gap_end = vm_start_gap(vma);
- if (gap_end < low_limit)
- return -ENOMEM;
- if (gap_start <= high_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit left subtree if it looks promising */
- if (vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- return -ENOMEM;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_right) {
- gap_start = vma->vm_prev ?
- vm_end_gap(vma->vm_prev) : 0;
- goto check_current;
- }
- }
- }
-
-found:
- /* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
- gap_end = info->high_limit;
-found_highest:
- /* Compute highest gap address at the desired alignment */
- gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
-
- VM_BUG_ON(gap_end < info->low_limit);
- VM_BUG_ON(gap_end < gap_start);
- return gap_end;
+ gap = mas.last + 1 - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ return gap;
}
/*
@@ -2232,6 +1778,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Ensures that larger anonymous mappings are THP aligned. */
+ get_area = thp_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
@@ -2249,58 +1798,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
+ unsigned long index = start_addr;
mmap_assert_locked(mm);
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *tmp;
-
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+/**
+ * find_vma() - Find the VMA for a given address, or the next VMA.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ *
+ * Returns: The VMA associated with addr, or the next VMA.
+ * May return %NULL in the case of no VMA at addr or above.
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long index = addr;
- if (vma)
- vmacache_update(addr, vma);
- return vma;
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
-
EXPORT_SYMBOL(find_vma);
-/*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+/**
+ * find_vma_prev() - Find the VMA for a given address, or the next vma and
+ * set %pprev to the previous VMA, if any.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ * @pprev: The pointer to set to the previous VMA
+ *
+ * Note that RCU lock is missing here since the external mmap_lock() is used
+ * instead.
+ *
+ * Returns: The VMA associated with @addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- vma = find_vma(mm, addr);
- if (vma) {
- *pprev = vma->vm_prev;
- } else {
- struct rb_node *rb_node = rb_last(&mm->mm_rb);
-
- *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
- }
+ vma = mas_walk(&mas);
+ *pprev = mas_prev(&mas, 0);
+ if (!vma)
+ vma = mas_next(&mas, ULONG_MAX);
return vma;
}
@@ -2354,6 +1912,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -2371,16 +1930,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
- next = vma->vm_next;
- if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2401,15 +1965,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2417,11 +1979,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2430,7 +1990,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2438,10 +1998,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
@@ -2450,7 +2010,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -EPERM;
/* Enforce stack_guard_gap */
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev)) {
@@ -2458,9 +2018,14 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2481,15 +2046,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2498,8 +2061,9 @@ int expand_downwards(struct vm_area_struct *vma,
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2508,7 +2072,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
@@ -2581,25 +2145,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL_GPL(find_extend_vma);
/*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+ * and do the vma updates.
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
unsigned long nr_accounted = 0;
+ struct vm_area_struct *vma;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
- do {
+ mas_for_each(mas, vma, ULONG_MAX) {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- vma = remove_vma(vma);
- } while (vma);
+ remove_vma(vma);
+ }
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -2609,67 +2174,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
+ struct vm_area_struct *next,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end);
- free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mt, vma, start, end);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
}
/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
-{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
-
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
- vma_rb_erase(vma, &mm->mm_rb);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= vma_pages(vma);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
- *insertion_point = vma;
- if (vma) {
- vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
- mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
- tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
-
- /*
- * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
- * VM_GROWSUP VMA. Such VMAs can change their size under
- * down_read(mmap_lock) and collide with the VMA we are about to unmap.
- */
- if (vma && (vma->vm_flags & VM_GROWSDOWN))
- return false;
- if (prev && (prev->vm_flags & VM_GROWSUP))
- return false;
- return true;
-}
-
-/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
@@ -2678,6 +2199,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct *new;
int err;
+ validate_mm_mt(mm);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
@@ -2720,6 +2242,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!err)
return 0;
+ /* Avoid vm accounting in close() operation */
+ new->vm_start = new->vm_end;
+ new->vm_pgoff = 0;
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
@@ -2730,6 +2255,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mpol_put(vma_policy(new));
out_free_vma:
vm_area_free(new);
+ validate_mm_mt(mm);
return err;
}
@@ -2746,38 +2272,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf, bool downgrade)
+static inline int munmap_sidetree(struct vm_area_struct *vma,
+ struct ma_state *mas_detach)
{
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
+ mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
+ if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
+ return -ENOMEM;
- len = PAGE_ALIGN(len);
- end = start + len;
- if (len == 0)
- return -EINVAL;
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm -= vma_pages(vma);
- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
- arch_unmap(mm, start, end);
+ return 0;
+}
- /* Find the first overlapping VMA where start < vma->vm_end */
- vma = find_vma_intersection(mm, start, end);
- if (!vma)
- return 0;
- prev = vma->vm_prev;
+/*
+ * do_mas_align_munmap() - munmap the aligned region from @start to @end.
+ * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_sem
+ *
+ * If @downgrade is true, check return code for potential release of the lock.
+ */
+static int
+do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool downgrade)
+{
+ struct vm_area_struct *prev, *next = NULL;
+ struct maple_tree mt_detach;
+ int count = 0;
+ int error = -ENOMEM;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+ mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2785,8 +2321,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
+
+ /* Does it split the first one? */
if (start > vma->vm_start) {
- int error;
/*
* Make sure that map_count on return from munmap() will
@@ -2794,22 +2331,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
+ goto map_count_exceeded;
+ /*
+ * mas_pause() is not needed since mas->index needs to be set
+ * differently than vma->vm_end anyways.
+ */
error = __split_vma(mm, vma, start, 0);
if (error)
- return error;
- prev = vma;
+ goto start_split_failed;
+
+ mas_set(mas, start);
+ vma = mas_walk(mas);
}
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = __split_vma(mm, last, end, 1);
+ prev = mas_prev(mas, 0);
+ if (unlikely((!prev)))
+ mas_set(mas, start);
+
+ /*
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
+ */
+ mas_for_each(mas, next, end - 1) {
+ /* Does it split the end? */
+ if (next->vm_end > end) {
+ struct vm_area_struct *split;
+
+ error = __split_vma(mm, next, end, 1);
+ if (error)
+ goto end_split_failed;
+
+ mas_set(mas, end);
+ split = mas_prev(mas, 0);
+ error = munmap_sidetree(split, &mas_detach);
+ if (error)
+ goto munmap_sidetree_failed;
+
+ count++;
+ if (vma == next)
+ vma = split;
+ break;
+ }
+ error = munmap_sidetree(next, &mas_detach);
if (error)
- return error;
+ goto munmap_sidetree_failed;
+
+ count++;
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < start);
+ BUG_ON(next->vm_start > end);
+#endif
}
- vma = vma_next(mm, prev);
+
+ if (!next)
+ next = mas_next(mas, ULONG_MAX);
if (unlikely(uf)) {
/*
@@ -2821,30 +2397,366 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ error = userfaultfd_unmap_prep(mm, start, end, uf);
+
if (error)
- return error;
+ goto userfaultfd_error;
+ }
+
+ /* Point of no return */
+ mas_set_range(mas, start, end - 1);
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, &mt_detach, start, end - 1);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ rcu_read_lock();
+ vma_test = mas_find(&test, end - 1);
+ mas_for_each(mas, vma_mas, end - 1) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, end - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(count != test_count);
+ mas_set_range(mas, start, end - 1);
+ }
+#endif
+ mas_store_prealloc(mas, NULL);
+ mm->map_count -= count;
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (downgrade) {
+ if (next && (next->vm_flags & VM_GROWSDOWN))
+ downgrade = false;
+ else if (prev && (prev->vm_flags & VM_GROWSUP))
+ downgrade = false;
+ else
+ mmap_write_downgrade(mm);
}
- /* Detach vmas from rbtree */
- if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
- downgrade = false;
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end);
+ /* Statistics and freeing VMAs */
+ mas_set(&mas_detach, start);
+ remove_mt(mm, &mas_detach);
+ __mt_destroy(&mt_detach);
- if (downgrade)
- mmap_write_downgrade(mm);
- unmap_region(mm, vma, prev, start, end);
+ validate_mm(mm);
+ return downgrade ? 1 : 0;
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
+userfaultfd_error:
+munmap_sidetree_failed:
+end_split_failed:
+ __mt_destroy(&mt_detach);
+start_split_failed:
+map_count_exceeded:
+ mas_destroy(mas);
+ return error;
+}
- return downgrade ? 1 : 0;
+/*
+ * do_mas_munmap() - munmap a given range.
+ * @mas: The maple state
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @downgrade: set to true if the user wants to attempt to write_downgrade the
+ * mmap_sem
+ *
+ * This function takes a @mas that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned and any arch_unmap work will be preformed.
+ *
+ * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ */
+int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
+
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, start, end);
+
+ /* Find the first overlapping VMA */
+ vma = mas_find(mas, end - 1);
+ if (!vma)
+ return 0;
+
+ return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
}
+/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length to be munmapped.
+ * @uf: The userfaultfd list_head
+ */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- return __do_munmap(mm, start, len, uf, false);
+ MA_STATE(mas, &mm->mm_mt, start, start);
+
+ return do_mas_munmap(&mas, mm, start, len, uf, false);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *next, *prev, *merge;
+ pgoff_t pglen = len >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, end);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Unmap any existing mapping in the area */
+ if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ return -ENOMEM;
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ next = mas_next(&mas, ULONG_MAX);
+ prev = mas_prev(&mas, 0);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX, NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma(vma, vm_flags);
+ goto expanded;
+ }
+
+ mas.index = addr;
+ mas.last = end - 1;
+cannot_expand:
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = end;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto free_vma;
+ }
+
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ */
+ WARN_ON_ONCE(addr != vma->vm_start);
+
+ addr = vma->vm_start;
+ mas_reset(&mas);
+
+ /*
+ * If vm_flags changed after call_mmap(), we should try merge
+ * vma again as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+ if (merge) {
+ /*
+ * ->mmap() can change vma->vm_file and fput
+ * the original file. So fput the vma->vm_file
+ * here or we would add an extra fput for file
+ * and cause general protection fault
+ * ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ error = -ENOMEM;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_mas_store(vma, &mas);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ /*
+ * vma_merge() calls khugepaged_enter_vma() either, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
+ /* Once vma denies write, undo our temporary denial count */
+unmap_writable:
+ if (file && vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ file = vma->vm_file;
+expanded:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
+ validate_mm(mm);
+ return addr;
+
+unmap_and_free_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ validate_mm(mm);
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
@@ -2852,11 +2764,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, start, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __do_munmap(mm, start, len, &uf, downgrade);
+ ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2922,11 +2835,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
if (start + size > vma->vm_end) {
- struct vm_area_struct *next;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ struct vm_area_struct *next, *prev = vma;
- for (next = vma->vm_next; next; next = next->vm_next) {
+ for_each_vma_range(vmi, next, start + size) {
/* hole between vmas ? */
- if (next->vm_start != next->vm_prev->vm_end)
+ if (next->vm_start != prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
@@ -2935,8 +2849,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (next->vm_flags != vma->vm_flags)
goto out;
- if (start + size <= next->vm_end)
- break;
+ prev = next;
}
if (!next)
@@ -2966,37 +2879,53 @@ out:
}
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
-
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ struct mm_struct *mm = vma->vm_mm;
+ int ret;
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ arch_unmap(mm, newbrk, oldbrk);
+ ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true);
+ validate_mm_mt(mm);
+ return ret;
+}
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
- /* Check against address space limits *after* clearing old maps... */
+ validate_mm_mt(mm);
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -3006,28 +2935,49 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma &&
+ (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+ mas->index = vma->vm_start;
+ mas->last = addr + len - 1;
+ vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_expand_failed;
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;
+
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -3035,16 +2985,32 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
+ validate_mm(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+
+mas_expand_failed:
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3055,13 +3021,36 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0);
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+ !can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
+ vma = NULL;
+
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
@@ -3077,34 +3066,19 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int count = 0;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Manually reap the mm to free as much memory as possible.
- * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_lock for
- * write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_lock is
- * dropped.
- *
- * Nothing can be holding mm->mmap_lock here and the above call
- * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
- * __oom_reap_task_mm() will not block.
- */
- (void)__oom_reap_task_mm(mm);
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
-
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mm->mmap;
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
return;
}
@@ -3112,19 +3086,37 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX);
+ mmap_read_unlock(mm);
+
+ /*
+ * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
+ * because the memory has been already freed.
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
- /* Walk the list again, actually closing and freeing it. */
- while (vma) {
+ /*
+ * Walk the list again, actually closing and freeing it, with preemption
+ * enabled, without holding any MM locks besides the unreachable
+ * mmap_write_lock.
+ */
+ do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ count++;
cond_resched();
- }
- mm->mmap = NULL;
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+ BUG_ON(count != mm->map_count);
+
+ trace_exit_mmap(mm);
+ __mt_destroy(&mm->mm_mt);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3135,14 +3127,14 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = vma_pages(vma);
+
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
return -ENOMEM;
+
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
/*
@@ -3162,7 +3154,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, vma)) {
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3178,9 +3174,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
+ validate_mm_mt(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3190,8 +3186,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ new_vma = find_vma_prev(mm, addr, &prev);
+ if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
+
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
@@ -3232,16 +3230,22 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
*need_rmap_locks = false;
}
+ validate_mm_mt(mm);
return new_vma;
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
+ validate_mm_mt(mm);
return NULL;
}
@@ -3378,6 +3382,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
+ validate_mm_mt(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3400,10 +3405,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
+ validate_mm_mt(mm);
return vma;
out:
vm_area_free(vma);
+ validate_mm_mt(mm);
return ERR_PTR(ret);
}
@@ -3528,12 +3535,13 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3541,7 +3549,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3549,7 +3558,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3608,11 +3618,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);