summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-04-14 15:09:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-04-14 15:09:40 -0700
commit6b3a707736301c2128ca85ce85fb13f60b5e350a (patch)
tree2bf1892cf29121150adece8d1221ecd513a4e792
parent4443f8e6ac7755cd775c70d08be8042dc2f936cb (diff)
parent15fab63e1e57be9fdb5eec1bbc5916e9825e9acb (diff)
Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch. Jann Horn reported that he can overflow the page ref count with sufficient memory (and a filesystem that is intentionally extremely slow). Admittedly it's not exactly easy. To have more than four billion references to a page requires a minimum of 32GB of kernel memory just for the pointers to the pages, much less any metadata to keep track of those pointers. Jann needed a total of 140GB of memory and a specially crafted filesystem that leaves all reads pending (in order to not ever free the page references and just keep adding more). Still, we have a fairly straightforward way to limit the two obvious user-controllable sources of page references: direct-IO like page references gotten through get_user_pages(), and the splice pipe page duplication. So let's just do that. * branch page-refs: fs: prevent page refcount overflow in pipe_buf_get mm: prevent get_user_pages() from overflowing page refcount mm: add 'try_get_page()' helper function mm: make page ref count overflow check tighter and more explicit
-rw-r--r--fs/fuse/dev.c12
-rw-r--r--fs/pipe.c4
-rw-r--r--fs/splice.c12
-rw-r--r--include/linux/mm.h15
-rw-r--r--include/linux/pipe_fs_i.h10
-rw-r--r--kernel/trace/trace.c6
-rw-r--r--mm/gup.c48
-rw-r--r--mm/hugetlb.c13
8 files changed, 92 insertions, 28 deletions
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8a63e52785e9..9971a35cf1ef 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
ret = -EINVAL;
- if (rem < len) {
- pipe_unlock(pipe);
- goto out;
- }
+ if (rem < len)
+ goto out_free;
rem = len;
while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
} else {
- pipe_buf_get(pipe, ibuf);
+ if (!pipe_buf_get(pipe, ibuf))
+ goto out_free;
+
*obuf = *ibuf;
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
ret = fuse_dev_do_write(fud, &cs, len);
pipe_lock(pipe);
+out_free:
for (idx = 0; idx < nbuf; idx++)
pipe_buf_release(pipe, &bufs[idx]);
pipe_unlock(pipe);
-out:
kvfree(bufs);
return ret;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 070aad543382..41065901106b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
* in the tee() system call, when we duplicate the buffers in one
* pipe into another.
*/
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- get_page(buf->page);
+ return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
diff --git a/fs/splice.c b/fs/splice.c
index 3ee7e82df48f..98943d9b219c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1593,7 +1593,11 @@ retry:
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
*obuf = *ibuf;
/*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* Get a reference to this pipe buffer,
* so we can copy the contents over.
*/
- pipe_buf_get(ipipe, ibuf);
+ if (!pipe_buf_get(ipipe, ibuf)) {
+ if (ret == 0)
+ ret = -EFAULT;
+ break;
+ }
obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76769749b5a5..6b10c21630f5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
+/* 127: arbitrary random number, small enough to assemble well */
+#define page_ref_zero_or_close_to_overflow(page) \
+ ((unsigned int) page_ref_count(page) + 127u <= 127u)
+
static inline void get_page(struct page *page)
{
page = compound_head(page);
@@ -973,8 +977,17 @@ static inline void get_page(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_refcount.
*/
- VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+ VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
+ page_ref_inc(page);
+}
+
+static inline __must_check bool try_get_page(struct page *page)
+{
+ page = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+ return false;
page_ref_inc(page);
+ return true;
}
static inline void put_page(struct page *page)
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 787d224ff43e..abb2dac3da9b 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -101,18 +101,20 @@ struct pipe_buf_operations {
/*
* Get a reference to the pipe buffer.
*/
- void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+ bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
/**
* pipe_buf_get - get a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to get a reference to
+ *
+ * Return: %true if the reference was successfully obtained.
*/
-static inline void pipe_buf_get(struct pipe_inode_info *pipe,
+static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- buf->ops->get(pipe, buf);
+ return buf->ops->get(pipe, buf);
}
/**
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *);
/* Generic pipe buffer ops functions */
-void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
+bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..6c24755655c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+ if (ref->ref > INT_MAX/2)
+ return false;
+
ref->ref++;
+ return true;
}
/* Pipe buffer operations for a buffer. */
diff --git a/mm/gup.c b/mm/gup.c
index f84e22685aaa..91819b8ad9cc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -160,8 +160,12 @@ retry:
goto retry;
}
- if (flags & FOLL_GET)
- get_page(page);
+ if (flags & FOLL_GET) {
+ if (unlikely(!try_get_page(page))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -298,7 +302,10 @@ retry_locked:
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
} else {
- get_page(page);
+ if (unlikely(!try_get_page(page))) {
+ spin_unlock(ptl);
+ return ERR_PTR(-ENOMEM);
+ }
spin_unlock(ptl);
lock_page(page);
ret = split_huge_page(page);
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if (is_device_public_page(*page))
goto unmap;
}
- get_page(*page);
+ if (unlikely(!try_get_page(*page))) {
+ ret = -ENOMEM;
+ goto unmap;
+ }
out:
ret = 0;
unmap:
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
+/*
+ * Return the compund head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ head = try_get_compound_head(page, 1);
+ if (!head)
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pmd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pmd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pud_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pud_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pgd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pgd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97b1e0290c66..6cdc7b2d9100 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * Instead of doing 'try_get_page()' below in the same_page
+ * loop, just check the count once here.
+ */
+ if (unlikely(page_count(page) <= 0)) {
+ if (pages) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);