summaryrefslogtreecommitdiff
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c291
1 files changed, 197 insertions, 94 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 596ac98051c5..0789a762ce2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
return ret;
}
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
+ return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+ struct rb_node *rb = rb_next(&se->rb_node);
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
int err = 0;
/* Do not discard the swap header page! */
- se = &si->first_swap_extent;
+ se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
cond_resched();
}
- list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ for (se = next_se(se); se; se = next_se(se)) {
start_block = se->start_block << (PAGE_SHIFT - 9);
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
return err; /* That will often be -EOPNOTSUPP */
}
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+ struct swap_extent *se;
+ struct rb_node *rb;
+
+ rb = sis->swap_extent_root.rb_node;
+ while (rb) {
+ se = rb_entry(rb, struct swap_extent, rb_node);
+ if (offset < se->start_page)
+ rb = rb->rb_left;
+ else if (offset >= se->start_page + se->nr_pages)
+ rb = rb->rb_right;
+ else
+ return se;
+ }
+ /* It *must* be present */
+ BUG();
+}
+
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
static void discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages)
{
- struct swap_extent *se = si->curr_swap_extent;
- int found_extent = 0;
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
while (nr_pages) {
- if (se->start_page <= start_page &&
- start_page < se->start_page + se->nr_pages) {
- pgoff_t offset = start_page - se->start_page;
- sector_t start_block = se->start_block + offset;
- sector_t nr_blocks = se->nr_pages - offset;
-
- if (nr_blocks > nr_pages)
- nr_blocks = nr_pages;
- start_page += nr_blocks;
- nr_pages -= nr_blocks;
-
- if (!found_extent++)
- si->curr_swap_extent = se;
-
- start_block <<= PAGE_SHIFT - 9;
- nr_blocks <<= PAGE_SHIFT - 9;
- if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
- break;
- }
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO, 0))
+ break;
- se = list_next_entry(se, list);
+ se = next_se(se);
}
}
@@ -1079,12 +1104,11 @@ fail:
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
- unsigned long offset, type;
+ unsigned long offset;
if (!entry.val)
goto out;
- type = swp_type(entry);
- p = swap_type_to_swap_info(type);
+ p = swp_swap_info(entry);
if (!p)
goto bad_nofile;
if (!(p->flags & SWP_USED))
@@ -1187,6 +1211,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
return usage;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
+ * true, the si->map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc. The caller must
+ * be prepared for that. For example, the following situation is
+ * possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long offset;
+
+ if (!entry.val)
+ goto out;
+ si = swp_swap_info(entry);
+ if (!si)
+ goto bad_nofile;
+
+ rcu_read_lock();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ rcu_read_unlock();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1358,11 +1445,18 @@ int page_swapcount(struct page *page)
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1481,11 @@ int __swp_swapcount(swp_entry_t entry)
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1684,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
return type;
}
if (bdev == sis->bdev) {
- struct swap_extent *se = &sis->first_swap_extent;
+ struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
if (bdev_p)
@@ -2161,7 +2257,6 @@ static void drain_mmlist(void)
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
struct swap_info_struct *sis;
- struct swap_extent *start_se;
struct swap_extent *se;
pgoff_t offset;
@@ -2169,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
*bdev = sis->bdev;
offset = swp_offset(entry);
- start_se = sis->curr_swap_extent;
- se = start_se;
-
- for ( ; ; ) {
- if (se->start_page <= offset &&
- offset < (se->start_page + se->nr_pages)) {
- return se->start_block + (offset - se->start_page);
- }
- se = list_next_entry(se, list);
- sis->curr_swap_extent = se;
- BUG_ON(se == start_se); /* It *must* be present */
- }
+ se = offset_to_swap_extent(sis, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -2198,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->first_swap_extent.list)) {
- struct swap_extent *se;
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
- se = list_first_entry(&sis->first_swap_extent.list,
- struct swap_extent, list);
- list_del(&se->list);
+ rb_erase(rb, &sis->swap_extent_root);
kfree(se);
}
@@ -2219,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
/*
* Add a block range (and the corresponding page range) into this swapdev's
- * extent list. The extent list is kept sorted in page order.
+ * extent tree.
*
* This function rather assumes that it is called in ascending page order.
*/
@@ -2227,20 +2311,21 @@ int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
struct swap_extent *se;
struct swap_extent *new_se;
- struct list_head *lh;
-
- if (start_page == 0) {
- se = &sis->first_swap_extent;
- sis->curr_swap_extent = se;
- se->start_page = 0;
- se->nr_pages = nr_pages;
- se->start_block = start_block;
- return 1;
- } else {
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
- se = list_entry(lh, struct swap_extent, list);
+
+ /*
+ * place the new node at the right most since the
+ * function is called in ascending page order.
+ */
+ while (*link) {
+ parent = *link;
+ link = &parent->rb_right;
+ }
+
+ if (parent) {
+ se = rb_entry(parent, struct swap_extent, rb_node);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
/* Merge it */
@@ -2249,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
}
}
- /*
- * No merge. Insert a new extent, preserving ordering.
- */
+ /* No merge, insert a new extent. */
new_se = kmalloc(sizeof(*se), GFP_KERNEL);
if (new_se == NULL)
return -ENOMEM;
@@ -2259,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+ rb_link_node(&new_se->rb_node, parent, link);
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2335,9 +2419,9 @@ static int swap_node(struct swap_info_struct *p)
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2362,7 +2446,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2389,7 +2477,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are valid
+ * between get/put_swap_device() if SWP_VALID bit is set
+ */
+ synchronize_rcu();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2398,7 +2496,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2501,6 +2600,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ synchronize_rcu();
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -2749,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
* would be relying on p->type to remain valid.
*/
}
- INIT_LIST_HEAD(&p->first_swap_extent.list);
+ p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
for_each_node(i)
plist_node_init(&p->avail_lists[i], 0);
@@ -3265,17 +3375,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
- goto out;
-
- p = swp_swap_info(entry);
+ p = get_swap_device(entry);
if (!p)
- goto bad_file;
-
- offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
goto out;
+ offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3321,11 +3425,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3417,6 +3519,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3527,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may be swapoff
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3450,9 +3553,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3504,10 +3606,11 @@ out_unlock_cont:
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*