author    Stephen Rothwell <sfr@canb.auug.org.au>    2011-06-06 12:53:05 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>    2011-06-06 12:53:05 +1000
commit 524fa7456aded7ab7948971ee6aac2f706cd107a (patch)
tree   6d0a3848370aef3eafe46378fb9ff436f3cd0053
parent c39400fbad8acaa1a54d36234299a37857b661d0 (diff)
parent d5fb3162b7b469edeaf1c596585ef593f1995a85 (diff)
Merge remote-tracking branch 'tmem/linux-next'
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-frontswap  16
-rw-r--r--  Documentation/vm/frontswap.txt  210
-rw-r--r--  drivers/xen/Kconfig  4
-rw-r--r--  drivers/xen/Makefile  2
-rw-r--r--  drivers/xen/tmem.c  158
-rw-r--r--  include/linux/frontswap.h  92
-rw-r--r--  include/linux/swap.h  2
-rw-r--r--  include/linux/swapfile.h  13
-rw-r--r--  mm/Kconfig  16
-rw-r--r--  mm/Makefile  1
-rw-r--r--  mm/frontswap.c  335
-rw-r--r--  mm/page_io.c  12
-rw-r--r--  mm/swapfile.c  65
13 files changed, 903 insertions, 23 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-frontswap b/Documentation/ABI/testing/sysfs-kernel-mm-frontswap
new file mode 100644
index 000000000000..215541ebdb7f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-frontswap
@@ -0,0 +1,16 @@
+What: /sys/kernel/mm/frontswap/
+Date: June 2010
+Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
+Description:
+ /sys/kernel/mm/frontswap/ contains a number of files which
+ record a count of various frontswap operations (sum across
+ all swap devices):
+ succ_puts
+ failed_puts
+ gets
+ flushes
+ In addition, reading the curr_pages file shows how many
+ pages are currently contained in frontswap and writing this
+ file with an integer performs a "partial swapoff", reducing
+ the number of frontswap pages to that integer if memory
+ constraints permit.
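
As a purely illustrative sketch (not part of this patch), a userspace
tool might consume this ABI as follows; the file paths come from the
ABI description above, while the halving policy is made up:

/* Illustrative userspace sketch: read the frontswap page count and
 * request a "partial swapoff" by writing a smaller target (root only). */
#include <stdio.h>

int main(void)
{
	unsigned long curr;
	FILE *f = fopen("/sys/kernel/mm/frontswap/curr_pages", "r");

	if (!f || fscanf(f, "%lu", &curr) != 1)
		return 1;
	fclose(f);
	printf("frontswap pages in use: %lu\n", curr);

	/* writing a smaller target shrinks frontswap toward that many
	 * pages, memory constraints permitting (hypothetical policy) */
	f = fopen("/sys/kernel/mm/frontswap/curr_pages", "w");
	if (f) {
		fprintf(f, "%lu\n", curr / 2);
		fclose(f);
	}
	return 0;
}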
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
new file mode 100644
index 000000000000..76b0f79ca877
--- /dev/null
+++ b/Documentation/vm/frontswap.txt
@@ -0,0 +1,210 @@
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device. The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size. The "device"
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type"). A "put_page" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "get_page" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from transcendent memory. A "flush_page" will remove the page from
+transcendent memory and a "flush_area" will remove ALL pages associated
+with the swap type (e.g., like swapoff) and notify the "device" to refuse
+further puts with that swap type.
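
To make that ops contract concrete, a minimal, purely illustrative
backend might look like the sketch below; the toy_* functions are
hypothetical stand-ins (this toy accepts nothing), but the struct
fields and signatures match frontswap_ops as added by this patch:

/* Hypothetical minimal frontswap backend sketch; not part of this patch. */
#include <linux/module.h>
#include <linux/frontswap.h>

static void toy_init(unsigned type)
{
	/* prepare to accept puts for this swap device number ("type") */
}

/* 0 == accepted; -1 == rejected, kernel falls back to the swap disk */
static int toy_put_page(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* this toy never accepts a page */
}

/* must succeed for any (type, offset) that was put and not flushed */
static int toy_get_page(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;
}

static void toy_flush_page(unsigned type, pgoff_t offset)
{
}

static void toy_flush_area(unsigned type)
{
}

static struct frontswap_ops toy_frontswap_ops = {
	.init		= toy_init,
	.put_page	= toy_put_page,
	.get_page	= toy_get_page,
	.flush_page	= toy_flush_page,
	.flush_area	= toy_flush_area,
};

static int __init toy_frontswap_register(void)
{
	struct frontswap_ops old =
		frontswap_register_ops(&toy_frontswap_ops);

	if (old.init != NULL)
		printk(KERN_WARNING "toy_frontswap: ops overridden\n");
	return 0;
}
module_init(toy_frontswap_register);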
+
+Once a page is successfully put, a matching get on the page will always
+succeed. So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap. If the put
+succeeds (returns zero), the data has been saved to transcendent memory
+and a disk write and, if the data is later read back, a disk read are
+avoided. If the put fails (returns non-zero), transcendent memory has
+rejected the data, and the page is written to swap as usual.
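
In code, the caller-side contract reduces to a sketch like the
following; example_swap_page_out is a hypothetical wrapper, but the
zero-means-success convention and the writeback calls mirror the
mm/page_io.c hook added later in this patch:

/* Hedged sketch of the swap-out decision around frontswap_put_page(). */
#include <linux/frontswap.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>

static int example_swap_page_out(struct page *page)
{
	if (frontswap_put_page(page) == 0) {
		/* data persisted in transcendent memory: no disk I/O */
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		return 0;
	}
	/* rejected: caller must issue the normal bio-based swap write */
	return -1;
}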
+
+Note that if a page is put and the page already exists in transcendent memory
+(a "duplicate" put), either the put succeeds and the data is overwritten,
+or the put fails AND the page is flushed. This ensures stale data can
+never be obtained from pseudo-RAM.
+
+Monitoring and control of frontswap is done by sysfs files in the
+/sys/kernel/mm/frontswap directory. The effectiveness of frontswap can
+be measured (across all swap devices) with:
+
+curr_pages - number of pages currently contained in frontswap
+failed_puts - how many put attempts have failed
+gets - how many gets were attempted (all should succeed)
+succ_puts - how many put attempts have succeeded
+flushes - how many flushes were attempted
+
+This number can be reduced by root by writing an integer target to curr_pages,
+which results in a "partial swapoff", thus reducing the number of frontswap
+pages to that target if memory constraints permit.
+
+FAQ
+
+1) Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent" memory that is otherwise not directly addressable to the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices). Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines. This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads). Frontswap -- and cleancache --
+with a fairly small impact on the kernel, provide a huge amount
+of flexibility for more dynamic RAM multiplexing.
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization. And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "self-ballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM if overall host system memory
+conditions allow.
+
+2) Sure there may be performance advantages in some situations, but
+ what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'd
+swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, each swap page read or write costs only one compare of a
+global variable against zero. If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "put"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd. This is added to the EIGHT bits (which
+were sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd. (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.) For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
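
The arithmetic behind that figure, as a small sketch (the helper name
is hypothetical):

/* One bit per 4K page: a 32GB swap device needs a 1MB frontswap_map. */
static unsigned long frontswap_map_bytes(unsigned long swap_bytes)
{
	unsigned long pages = swap_bytes >> 12;	/* 4K pages */

	return pages / 8;	/* one bit per page, eight bits per byte */
}
/* frontswap_map_bytes(32UL << 30) == 1UL << 20, i.e. 1MB per 32GB */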
+
+3) OK, how about a quick overview of what this frontswap patch does
+ in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel. Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "put" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (cf. swap_writepage()), frontswap_put_page is called. Frontswap
+consults with the frontswap backend and if the backend says
+it does NOT have room, frontswap_put_page fails (returns -1) and the page is
+swapped as normal. Note that the response from the frontswap
+backend is essentially random; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page. But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data. In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_get_page() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend. If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete. If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device write
+and (potentially) a swap device read are replaced by a "frontswap backend
+put" and (possibly) a "frontswap backend get", which are presumably much
+faster.
+
+4) Can't frontswap be configured as a "special" swap device that is
+ just higher priority than any real swap device (e.g. like zswap)?
+
+No. Recall that acceptance of any swap page by the frontswap
+backend is entirely unpredictable. This is critical to the definition
+of frontswap because it grants completely dynamic discretion to the
+backend. But since any "put" might fail, there must always be a real
+slot on a real swap device to swap the page. Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all.
+On the downside, this also means that frontswap cannot contain more
+pages than the total capacity of the swapon'd swap devices. For example, if NO
+swap device is configured on some installation, frontswap is useless.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O. The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time.
+Synchrony is required to ensure the dynamicity of the backend.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit". For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+5) Why this weird definition about "duplicate puts"? If a page
+ has been previously successfully put, can't it always be
+ successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot. Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K. Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K. But the backend
+has no more space. In this case, the put must be rejected. Whenever
+frontswap rejects a put that would overwrite, it also must flush
+the old data and ensure that it is no longer accessible. Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
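
A sketch of that backend-side decision follows; compress_page(), the
zpool_* helpers and scratch_buf are assumed, hypothetical names, not
part of this patch:

/* Hypothetical compressing backend illustrating the duplicate-put rule. */
#include <linux/mm.h>
#include <linux/types.h>

size_t compress_page(struct page *page, void *dst);	/* assumed */
size_t zpool_bytes_free(void);				/* assumed */
void zpool_erase(unsigned type, pgoff_t offset);	/* assumed */
void zpool_store(unsigned type, pgoff_t offset,
		 void *src, size_t len);		/* assumed */
static char scratch_buf[4096];

static int zexample_put_page(unsigned type, pgoff_t offset,
			     struct page *page)
{
	size_t clen = compress_page(page, scratch_buf);

	if (clen > zpool_bytes_free()) {
		/* a failed duplicate put must also flush the stale
		 * (older, e.g. 1K) copy so a later get cannot return
		 * out-of-date data */
		zpool_erase(type, offset);
		return -1;	/* kernel writes the page to real swap */
	}
	zpool_store(type, offset, scratch_buf, clen);
	return 0;
}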
+
+6) What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space. But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate. The frontswap_shrink
+routine allows a process outside of the swap subsystem (such as
+a userland service via the sysfs interface, or a kernel thread)
+to force pages out of the memory managed by frontswap and back into
+kernel-addressable memory.
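
For example, a kernel-side policy might do something like the hedged
sketch below; the cap constant and the halving policy are hypothetical,
only frontswap_curr_pages() and frontswap_shrink() come from this patch:

/* Hypothetical in-kernel caller of frontswap_shrink(). */
#include <linux/frontswap.h>

#define EXAMPLE_FRONTSWAP_CAP	(1UL << 18)	/* 1GB worth of 4K pages */

static void example_trim_frontswap(void)
{
	unsigned long pages = frontswap_curr_pages();

	if (pages > EXAMPLE_FRONTSWAP_CAP)
		frontswap_shrink(pages / 2);	/* "partial swapoff" */
}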
+
+7) Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global. This seemed a reasonable compromise: Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated May 27, 2011
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index a59638b37c1a..bfac8bf7315f 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -105,4 +105,8 @@ config SWIOTLB_XEN
depends on PCI
select SWIOTLB
+config XEN_TMEM
+ bool
+ default y if (CLEANCACHE || FRONTSWAP)
+
endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index bbc18258ecc5..7c1bc0a4d9f4 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,6 +1,5 @@
obj-y += grant-table.o features.o events.o manage.o balloon.o
obj-y += xenbus/
-obj-y += tmem.o
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_features.o := $(nostackp)
@@ -15,6 +14,7 @@ obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
+obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index 816a44959ef0..ad6a8f470943 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/cleancache.h>
+#include <linux/frontswap.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
@@ -122,13 +123,6 @@ static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
}
-static int xen_tmem_destroy_pool(u32 pool_id)
-{
- struct tmem_oid oid = { { 0 } };
-
- return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
-}
-
int tmem_enabled;
static int __init enable_tmem(char *s)
@@ -139,6 +133,14 @@ static int __init enable_tmem(char *s)
__setup("tmem", enable_tmem);
+#ifdef CONFIG_CLEANCACHE
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+ struct tmem_oid oid = { { 0 } };
+
+ return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
/* cleancache ops */
static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
@@ -240,18 +242,154 @@ static struct cleancache_ops tmem_cleancache_ops = {
.init_shared_fs = tmem_cleancache_init_shared_fs,
.init_fs = tmem_cleancache_init_fs
};
+#endif
-static int __init xen_tmem_init(void)
+#ifdef CONFIG_FRONTSWAP
+/* frontswap tmem operations */
+
+/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+static int tmem_frontswap_poolid;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS 4
+#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
+#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind) (_ind >> SWIZ_BITS)
+
+static inline struct tmem_oid oswiz(unsigned type, u32 ind)
+{
+ struct tmem_oid oid = { .oid = { 0 } };
+ oid.oid[0] = _oswiz(type, ind);
+ return oid;
+}
+
+/* returns 0 if the page was successfully put into frontswap, -1 if not */
+static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long pfn = page_to_pfn(page);
+ int pool = tmem_frontswap_poolid;
+ int ret;
+
+ if (pool < 0)
+ return -1;
+ if (ind64 != ind)
+ return -1;
+ mb(); /* ensure page is quiescent; tmem may address it with an alias */
+ ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+ /* translate Xen tmem return values to linux semantics */
+ if (ret == 1)
+ return 0;
+ else
+ return -1;
+}
+
+/* returns 0 if the page was successfully gotten from frontswap, -1 if
+ * the page was not present (should never happen!) */
+static int tmem_frontswap_get_page(unsigned type, pgoff_t offset,
+ struct page *page)
{
- struct cleancache_ops old_ops;
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long pfn = page_to_pfn(page);
+ int pool = tmem_frontswap_poolid;
+ int ret;
+ if (pool < 0)
+ return -1;
+ if (ind64 != ind)
+ return -1;
+ ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+ /* translate Xen tmem return values to linux semantics */
+ if (ret == 1)
+ return 0;
+ else
+ return -1;
+}
+
+/* flush a single page from frontswap */
+static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ int pool = tmem_frontswap_poolid;
+
+ if (pool < 0)
+ return;
+ if (ind64 != ind)
+ return;
+ (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind));
+}
+
+/* flush all pages from the passed swaptype */
+static void tmem_frontswap_flush_area(unsigned type)
+{
+ int pool = tmem_frontswap_poolid;
+ int ind;
+
+ if (pool < 0)
+ return;
+ for (ind = SWIZ_MASK; ind >= 0; ind--)
+ (void)xen_tmem_flush_object(pool, oswiz(type, ind));
+}
+
+static void tmem_frontswap_init(unsigned ignored)
+{
+ struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID;
+
+ /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+ if (tmem_frontswap_poolid < 0)
+ tmem_frontswap_poolid =
+ xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE);
+}
+
+static int use_frontswap = 1;
+
+static int __init no_frontswap(char *s)
+{
+ use_frontswap = 0;
+ return 1;
+}
+
+__setup("nofrontswap", no_frontswap);
+
+static struct frontswap_ops tmem_frontswap_ops = {
+ .put_page = tmem_frontswap_put_page,
+ .get_page = tmem_frontswap_get_page,
+ .flush_page = tmem_frontswap_flush_page,
+ .flush_area = tmem_frontswap_flush_area,
+ .init = tmem_frontswap_init
+};
+#endif
+
+static int __init xen_tmem_init(void)
+{
if (!xen_domain())
return 0;
+#ifdef CONFIG_FRONTSWAP
+ if (tmem_enabled && use_frontswap) {
+ char *s = "";
+ struct frontswap_ops old_ops =
+ frontswap_register_ops(&tmem_frontswap_ops);
+
+ tmem_frontswap_poolid = -1;
+ if (old_ops.init != NULL)
+ s = " (WARNING: frontswap_ops overridden)";
+ printk(KERN_INFO "frontswap enabled, RAM provided by "
+ "Xen Transcendent Memory\n");
+ }
+#endif
#ifdef CONFIG_CLEANCACHE
BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
if (tmem_enabled && use_cleancache) {
char *s = "";
- old_ops = cleancache_register_ops(&tmem_cleancache_ops);
+ struct cleancache_ops old_ops =
+ cleancache_register_ops(&tmem_cleancache_ops);
if (old_ops.init_fs != NULL)
s = " (WARNING: cleancache_ops overridden)";
printk(KERN_INFO "cleancache enabled, RAM provided by "
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
new file mode 100644
index 000000000000..7c62423c67de
--- /dev/null
+++ b/include/linux/frontswap.h
@@ -0,0 +1,92 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+
+struct frontswap_ops {
+ void (*init)(unsigned);
+ int (*put_page)(unsigned, pgoff_t, struct page *);
+ int (*get_page)(unsigned, pgoff_t, struct page *);
+ void (*flush_page)(unsigned, pgoff_t);
+ void (*flush_area)(unsigned);
+};
+
+extern int frontswap_enabled;
+extern struct frontswap_ops
+ frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_put_page(struct page *page);
+extern int __frontswap_get_page(struct page *page);
+extern void __frontswap_flush_page(unsigned, pgoff_t);
+extern void __frontswap_flush_area(unsigned);
+
+#ifndef CONFIG_FRONTSWAP
+/* all inline routines become no-ops and all externs are ignored */
+#define frontswap_enabled (0)
+#endif
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+ int ret = 0;
+
+ if (frontswap_enabled && sis->frontswap_map)
+ ret = test_bit(offset % BITS_PER_LONG,
+ &sis->frontswap_map[offset/BITS_PER_LONG]);
+ return ret;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ set_bit(offset % BITS_PER_LONG,
+ &sis->frontswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ clear_bit(offset % BITS_PER_LONG,
+ &sis->frontswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline int frontswap_put_page(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_put_page(page);
+ return ret;
+}
+
+static inline int frontswap_get_page(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_get_page(page);
+ return ret;
+}
+
+static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ if (frontswap_enabled)
+ __frontswap_flush_page(type, offset);
+}
+
+static inline void frontswap_flush_area(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_flush_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 384eb5fe530b..5d7483cfa159 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -194,6 +194,8 @@ struct swap_info_struct {
struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */
+ unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
+ unsigned int frontswap_pages; /* frontswap pages in-use counter */
};
struct swap_list_t {
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
new file mode 100644
index 000000000000..e282624e8c10
--- /dev/null
+++ b/include/linux/swapfile.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_SWAPFILE_H
+#define _LINUX_SWAPFILE_H
+
+/*
+ * these were static in swapfile.c but frontswap.c needs them and we don't
+ * want to expose them to the dozens of source files that include swap.h
+ */
+extern spinlock_t swap_lock;
+extern struct swap_list_t swap_list;
+extern struct swap_info_struct *swap_info[];
+extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+#endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c8..115c396cb083 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -370,3 +370,19 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap pseudo-RAM driver to cache swap pages"
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite of
+ a "backing" store for a swap device. The storage is assumed to be
+ a synchronous concurrency-safe page-oriented pseudo-RAM device (such
+ as Xen's Transcendent Memory, aka "tmem") which is not directly
+ accessible or addressable by the kernel and is of unknown (and
+ possibly time-varying) size. When a pseudo-RAM device is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..72c9e4fdde00 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..22f866e9e05a
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,335 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+int frontswap_enabled;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/* useful stats available in /sys/kernel/mm/frontswap */
+static unsigned long frontswap_gets;
+static unsigned long frontswap_succ_puts;
+static unsigned long frontswap_failed_puts;
+static unsigned long frontswap_flushes;
+
+/*
+ * register operations for frontswap, returning the previous ops and
+ * allowing detection of multiple backends and possible nesting
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = 1;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/* Called when a swap device is swapon'd */
+void __frontswap_init(unsigned type)
+{
+ if (frontswap_enabled)
+ (*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Put" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data
+ * and return success or flush the page from frontswap and return failure.
+ */
+int __frontswap_put_page(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = (*frontswap_ops.put_page)(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ frontswap_succ_puts++;
+ if (!dup)
+ sis->frontswap_pages++;
+ } else if (dup) {
+ /*
+  * failed dup always results in automatic flush of
+  * the (older) page from frontswap
+  */
+ frontswap_clear(sis, offset);
+ sis->frontswap_pages--;
+ frontswap_failed_puts++;
+ } else
+ frontswap_failed_puts++;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_put_page);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache
+ */
+int __frontswap_get_page(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ if (frontswap_test(sis, offset))
+ ret = (*frontswap_ops.get_page)(type, offset, page);
+ if (ret == 0)
+ frontswap_gets++;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_get_page);
+
+/*
+ * Flush any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (frontswap_test(sis, offset)) {
+ (*frontswap_ops.flush_page)(type, offset);
+ sis->frontswap_pages--;
+ frontswap_clear(sis, offset);
+ frontswap_flushes++;
+ }
+}
+EXPORT_SYMBOL(__frontswap_flush_page);
+
+/*
+ * Flush all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_flush_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ (*frontswap_ops.flush_area)(type);
+ sis->frontswap_pages = 0;
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_flush_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ int wrapped = 0;
+ bool locked = false;
+
+ for (wrapped = 0; wrapped <= 3; wrapped++) {
+
+ struct swap_info_struct *si = NULL;
+ unsigned long total_pages = 0, total_pages_to_unuse;
+ unsigned long pages = 0, unuse_pages = 0;
+ int type;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ locked = true;
+ total_pages = 0;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ total_pages += si->frontswap_pages;
+ }
+ if (total_pages <= target_pages)
+ goto out;
+ total_pages_to_unuse = total_pages - target_pages;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ if (total_pages_to_unuse < si->frontswap_pages)
+ pages = unuse_pages = total_pages_to_unuse;
+ else {
+ pages = si->frontswap_pages;
+ unuse_pages = 0; /* unuse all */
+ }
+ if (security_vm_enough_memory_kern(pages))
+ continue;
+ vm_unacct_memory(pages);
+ break;
+ }
+ if (type < 0)
+ goto out;
+ locked = false;
+ spin_unlock(&swap_lock);
+ try_to_unuse(type, true, unuse_pages);
+ }
+
+out:
+ if (locked)
+ spin_unlock(&swap_lock);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * count and return the number of frontswap pages across all
+ * swap devices. This is exported so that a kernel module can
+ * determine current usage without reading sysfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ spin_lock(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += si->frontswap_pages;
+ }
+ spin_unlock(&swap_lock);
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/testing/sysfs-kernel-mm-frontswap */
+
+#define FRONTSWAP_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define FRONTSWAP_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t curr_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_curr_pages());
+}
+
+static ssize_t curr_pages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long target_pages;
+ int err;
+
+ err = strict_strtoul(buf, 10, &target_pages);
+ if (err)
+ return -EINVAL;
+
+ frontswap_shrink(target_pages);
+
+ return count;
+}
+FRONTSWAP_ATTR(curr_pages);
+
+static ssize_t succ_puts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_succ_puts);
+}
+FRONTSWAP_ATTR_RO(succ_puts);
+
+static ssize_t failed_puts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_failed_puts);
+}
+FRONTSWAP_ATTR_RO(failed_puts);
+
+static ssize_t gets_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_gets);
+}
+FRONTSWAP_ATTR_RO(gets);
+
+static ssize_t flushes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", frontswap_flushes);
+}
+FRONTSWAP_ATTR_RO(flushes);
+
+static struct attribute *frontswap_attrs[] = {
+ &curr_pages_attr.attr,
+ &succ_puts_attr.attr,
+ &failed_puts_attr.attr,
+ &gets_attr.attr,
+ &flushes_attr.attr,
+ NULL,
+};
+
+static struct attribute_group frontswap_attr_group = {
+ .attrs = frontswap_attrs,
+ .name = "frontswap",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_SYSFS
+ int err;
+
+ err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
+#endif /* CONFIG_SYSFS */
+ return 0;
+}
+
+static void __exit exit_frontswap(void)
+{
+ frontswap_shrink(0UL);
+}
+
+module_init(init_frontswap);
+module_exit(exit_frontswap);
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..651a91259317 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ if (frontswap_put_page(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+ if (frontswap_get_page(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7b..77d8ac505152 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -32,6 +32,8 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -43,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -54,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -557,6 +559,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ frontswap_flush_page(p->type, offset);
if ((p->flags & SWP_BLKDEV) &&
disk->fops->swap_slot_free_notify)
disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -1022,7 +1025,7 @@ static int unuse_mm(struct mm_struct *mm,
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -1048,6 +1051,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
@@ -1059,8 +1068,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -1093,7 +1106,7 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1260,6 +1273,10 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
@@ -1519,7 +1536,8 @@ bad_bmap:
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map)
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
{
int i, prev;
@@ -1529,6 +1547,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ p->frontswap_map = frontswap_map;
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
@@ -1545,6 +1564,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
@@ -1616,7 +1636,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
- err = try_to_unuse(type);
+ err = try_to_unuse(type, false, 0);
test_set_oom_score_adj(oom_score_adj);
if (err) {
@@ -1627,7 +1647,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map);
+ enable_swap_info(p, p->prio, p->swap_map, p->frontswap_map);
goto out_dput;
}
@@ -1653,9 +1673,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ frontswap_flush_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ if (p->frontswap_map)
+ vfree(p->frontswap_map);
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -2028,6 +2051,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2108,6 +2132,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled) {
+ frontswap_map = vmalloc(maxpages / sizeof(long));
+ if (frontswap_map)
+ memset(frontswap_map, 0, maxpages / sizeof(long));
+ }
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2123,14 +2153,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "");
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (p->frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
@@ -2321,6 +2352,10 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
base++;
spin_lock(&swap_lock);
+ if (frontswap_test(si, target)) {
+ spin_unlock(&swap_lock);
+ return 0;
+ }
if (end > si->max) /* don't go beyond end of map */
end = si->max;
@@ -2331,6 +2366,9 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
break;
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
+ /* Don't read in frontswap pages */
+ if (frontswap_test(si, toff))
+ break;
}
/* Count contiguous allocated slots below our target */
for (toff = target; --toff >= base; nr_pages++) {
@@ -2339,6 +2377,9 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
break;
if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
break;
+ /* Don't read in frontswap pages */
+ if (frontswap_test(si, toff))
+ break;
}
spin_unlock(&swap_lock);