summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt9
-rw-r--r--Documentation/vm/frontswap.txt278
-rw-r--r--MAINTAINERS7
-rw-r--r--arch/arm/mach-shmobile/Kconfig6
-rw-r--r--arch/avr32/kernel/signal.c2
-rw-r--r--arch/blackfin/kernel/process.c2
-rw-r--r--arch/x86/include/asm/uaccess.h12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c145
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c9
-rw-r--r--arch/x86/kernel/smpboot.c10
-rw-r--r--arch/x86/lib/usercopy.c4
-rw-r--r--arch/x86/lib/x86-opcode-map.txt8
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk14
-rw-r--r--arch/xtensa/include/asm/syscall.h4
-rw-r--r--arch/xtensa/kernel/signal.c2
-rw-r--r--drivers/clocksource/Makefile1
-rw-r--r--drivers/clocksource/em_sti.c406
-rw-r--r--drivers/staging/ramster/zcache-main.c8
-rw-r--r--drivers/staging/zcache/zcache-main.c10
-rw-r--r--drivers/xen/tmem.c8
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/cifssmb.c8
-rw-r--r--fs/cifs/connect.c8
-rw-r--r--fs/cifs/file.c106
-rw-r--r--fs/cifs/misc.c89
-rw-r--r--fs/cifs/smb1ops.c89
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/fuse/control.c10
-rw-r--r--fs/fuse/dir.c11
-rw-r--r--fs/fuse/file.c40
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/proc/base.c17
-rw-r--r--include/linux/clockchips.h1
-rw-r--r--include/linux/frontswap.h127
-rw-r--r--include/linux/fuse.h14
-rw-r--r--include/linux/init_task.h2
-rw-r--r--include/linux/perf_event.h4
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/swapfile.h13
-rw-r--r--kernel/events/core.c1
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c39
-rw-r--r--kernel/irq/migration.c13
-rw-r--r--kernel/sched/core.c68
-rw-r--r--kernel/sched/fair.c42
-rw-r--r--kernel/sched/rt.c51
-rw-r--r--kernel/smpboot.c17
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/tick-sched.c19
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile1
-rw-r--r--mm/frontswap.c314
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/page_io.c12
-rw-r--r--mm/swapfile.c54
-rw-r--r--tools/perf/MANIFEST2
-rw-r--r--tools/perf/builtin-report.c4
-rw-r--r--tools/perf/builtin-stat.c8
-rw-r--r--tools/perf/builtin-top.c2
-rw-r--r--tools/perf/design.txt7
-rw-r--r--tools/perf/ui/browsers/annotate.c2
-rwxr-xr-xtools/perf/util/PERF-VERSION-GEN2
-rw-r--r--tools/perf/util/callchain.c2
-rw-r--r--tools/perf/util/callchain.h2
-rw-r--r--tools/perf/util/evlist.c17
-rw-r--r--tools/perf/util/evlist.h4
-rw-r--r--tools/perf/util/evsel.c29
-rw-r--r--tools/perf/util/hist.c7
-rw-r--r--tools/perf/util/hist.h2
-rw-r--r--tools/perf/util/pager.c4
-rw-r--r--tools/perf/util/probe-event.c8
-rw-r--r--tools/perf/util/session.c97
-rw-r--r--tools/perf/util/symbol.c38
-rw-r--r--tools/perf/util/symbol.h30
80 files changed, 2059 insertions, 409 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c45513d806ab..a92c5ebf373e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2543,6 +2543,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
sched_debug [KNL] Enables verbose scheduler debug messages.
+ skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
+ xtime_lock contention on larger systems, and/or RCU lock
+ contention on all systems with CONFIG_MAXSMP set.
+ Format: { "0" | "1" }
+ 0 -- disable. (may be 1 via CONFIG_CMDLINE="skew_tick=1"
+ 1 -- enable.
+ Note: increases power consumption, thus should only be
+ enabled if running jitter sensitive (HPC/RT) workloads.
+
security= [SECURITY] Choose a security module to enable at boot.
If this boot parameter is not specified, only the first
security module asking for security registration will be
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
new file mode 100644
index 000000000000..37067cf455f4
--- /dev/null
+++ b/Documentation/vm/frontswap.txt
@@ -0,0 +1,278 @@
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends"
+and the only necessary changes to the core kernel for transcendent memory;
+all other supporting code -- the "backends" -- is implemented as drivers.
+See the LWN.net article "Transcendent memory in a nutshell" for a detailed
+overview of frontswap and related kernel parts:
+https://lwn.net/Articles/454795/ )
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device. The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size. The driver
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type"). A "store" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "load" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from from transcendent memory. An "invalidate_page" will remove the page
+from transcendent memory and an "invalidate_area" will remove ALL pages
+associated with the swap type (e.g., like swapoff) and notify the "device"
+to refuse further stores with that swap type.
+
+Once a page is successfully stored, a matching load on the page will normally
+succeed. So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap. If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+If a backend chooses, frontswap can be configured as a "writethrough
+cache" by calling frontswap_writethrough(). In this mode, the reduction
+in swap device writes is lost (and also a non-trivial performance advantage)
+in order to allow the backend to arbitrarily "reclaim" space used to
+store frontswap pages to more completely manage its memory usage.
+
+Note that if a page is stored and the page already exists in transcendent memory
+(a "duplicate" store), either the store succeeds and the data is overwritten,
+or the store fails AND the page is invalidated. This ensures stale data may
+never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the /sys/kernel/debug/frontswap directory. The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+failed_stores - how many store attempts have failed
+loads - how many loads were attempted (all should succeed)
+succ_stores - how many store attempts have succeeded
+invalidates - how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+
+1) Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable to the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices). Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+Frontswap -- and cleancache -- with a fairly small impact on the kernel,
+provides a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total anonymous pages
+that can be safely kept in RAM. Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems. Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM. This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa. RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources acrosst the varying demands of multiple
+virtual machines. This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization. And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml. And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+2) Sure there may be performance advantages in some situations, but
+ what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'ed
+swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra global variable compared to zero for
+every swap page read or written. If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd. This is added to the EIGHT bits (which
+was sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd. (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.) For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages. A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+3) OK, how about a quick overview of what this frontswap patch does
+ in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel. Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (c.f swap_writepage()), frontswap_store is called. Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store returns -1 and the kernel swaps the page
+to the swap device as normal. Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page. But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data. In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend. If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete. If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend loads", which are presumably much
+faster.
+
+4) Can't frontswap be configured as a "special" swap device that is
+ just higher priority than any real swap device (e.g. like zswap,
+ or maybe swap-over-nbd/NFS)?
+
+No. First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy. Perhaps it could be rewritten to accomodate a hierarchy,
+but this would require fairly drastic changes. Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable. Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend. In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O. The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time. Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem. That said, only the initial "store"
+and "load" operations need be synchronous. A separate asynchronous thread
+is free to manipulate the pages stored by frontswap. For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit". For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap: Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page. Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all. This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices. For example, if NO swap device is configured on some
+installation, frontswap is useless. Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+5) Why this weird definition about "duplicate stores"? If a page
+ has been previously successfully stored, can't it always be
+ successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot. Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K. Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K. But the backend
+has no more space. In this case, the store must be rejected. Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible. Since the
+swap subsystem then writes the new data to the read swap device,
+this is the correct course of action to ensure coherency.
+
+6) What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space. But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate. The frontswap_shrink
+routine allows code outside of the swap subsystem to force pages out
+of the memory managed by frontswap and back into kernel-addressable memory.
+For example, in RAMster, a "suction driver" thread will attempt
+to "repatriate" pages sent to a remote machine back to the local machine;
+this is driven using the frontswap_shrink mechanism when memory pressure
+subsides.
+
+7) Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global. This seemed a reasonable compromise: Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/MAINTAINERS b/MAINTAINERS
index 55f0fda602ec..6a52bb4a4fc7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2930,6 +2930,13 @@ F: Documentation/power/freezing-of-tasks.txt
F: include/linux/freezer.h
F: kernel/freezer.c
+FRONTSWAP API
+M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: mm/frontswap.c
+F: include/linux/frontswap.h
+
FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS
M: David Howells <dhowells@redhat.com>
L: linux-cachefs@redhat.com
diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig
index f31383c32f9c..df33909205e2 100644
--- a/arch/arm/mach-shmobile/Kconfig
+++ b/arch/arm/mach-shmobile/Kconfig
@@ -186,6 +186,12 @@ config SH_TIMER_TMU
help
This enables build of the TMU timer driver.
+config EM_TIMER_STI
+ bool "STI timer driver"
+ default y
+ help
+ This enables build of the STI timer driver.
+
endmenu
config SH_CLK_CPG
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index c140f9b41dce..d552a854dacc 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -300,7 +300,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR)
syscall = 1;
- if (ti->flags & _TIF_SIGPENDING))
+ if (ti->flags & _TIF_SIGPENDING)
do_signal(regs, syscall);
if (ti->flags & _TIF_NOTIFY_RESUME) {
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 2e3994b20169..62bcea7dcc6d 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs)
unsigned long newsp;
#ifdef __ARCH_SYNC_CORE_DCACHE
- if (current->rt.nr_cpus_allowed == num_possible_cpus())
+ if (current->nr_cpus_allowed == num_possible_cpus())
set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id()));
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 04cd6882308e..e1f3a17034fc 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -33,9 +33,8 @@
#define segment_eq(a, b) ((a).seg == (b).seg)
#define user_addr_max() (current_thread_info()->addr_limit.seg)
-#define __addr_ok(addr) \
- ((unsigned long __force)(addr) < \
- (current_thread_info()->addr_limit.seg))
+#define __addr_ok(addr) \
+ ((unsigned long __force)(addr) < user_addr_max())
/*
* Test whether a block of memory is a valid user space address.
@@ -47,14 +46,14 @@
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*/
-#define __range_not_ok(addr, size) \
+#define __range_not_ok(addr, size, limit) \
({ \
unsigned long flag, roksum; \
__chk_user_ptr(addr); \
asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
: "=&r" (flag), "=r" (roksum) \
: "1" (addr), "g" ((long)(size)), \
- "rm" (current_thread_info()->addr_limit.seg)); \
+ "rm" (limit)); \
flag; \
})
@@ -77,7 +76,8 @@
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*/
-#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
+#define access_ok(type, addr, size) \
+ (likely(__range_not_ok(addr, size, user_addr_max()) == 0))
/*
* The exception table consists of pairs of addresses relative to the
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e049d6da0183..c4706cf9c011 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
if (!cpuc->shared_regs)
goto error;
}
+ cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -1756,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
@@ -1780,7 +1787,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (bytes != sizeof(frame))
break;
- if (fp < compat_ptr(regs->sp))
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
@@ -1826,7 +1833,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (bytes != sizeof(frame))
break;
- if ((unsigned long)fp < regs->sp)
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf54493..7241e2fc3c17 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -117,6 +117,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
+ int is_fake;
/*
* Intel DebugStore bits
@@ -364,6 +365,7 @@ struct x86_pmu {
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
/*
* Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 166546ec6aef..187c294bc658 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
{
if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return false;
+ return idx;
- if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ if (idx == EXTRA_REG_RSP_0)
+ return EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ return EXTRA_REG_RSP_0;
+
+ return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
event->hw.config |= 0x01b7;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
-
- if (event->hw.extra_reg.idx == orig_idx)
- return false;
-
- return true;
}
/*
@@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
struct event_constraint *c = &emptyconstraint;
struct er_account *era;
unsigned long flags;
- int orig_idx = reg->idx;
+ int idx = reg->idx;
- /* already allocated shared msr */
- if (reg->alloc)
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
return NULL; /* call x86_get_event_constraint() */
again:
- era = &cpuc->shared_regs->regs[reg->idx];
+ era = &cpuc->shared_regs->regs[idx];
/*
* we use spin_lock_irqsave() to avoid lockdep issues when
* passing a fake cpuc
@@ -1173,6 +1183,29 @@ again:
if (!atomic_read(&era->ref) || era->config == reg->config) {
+ /*
+ * If its a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling(). reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
@@ -1180,17 +1213,17 @@ again:
/* one more user */
atomic_inc(&era->ref);
- /* no need to reallocate during incremental event scheduling */
- reg->alloc = 1;
-
/*
* need to call x86_get_event_constraint()
* to check if associated event has constraints
*/
c = NULL;
- } else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
+ } else {
+ idx = intel_alt_er(idx);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
struct er_account *era;
/*
- * only put constraint if extra reg was actually
- * allocated. Also takes care of event which do
- * not use an extra shared reg
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
*/
- if (!reg->alloc)
+ if (!reg->alloc || cpuc->is_fake)
return;
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1300,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_shared_regs_event_constraints(cpuc, event);
}
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip &&
- (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
/*
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
* (0x003c) so that we can use it with PEBS.
@@ -1329,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
if (intel_pmu_needs_lbr_smpl(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -1607,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
.format_attrs = intel_arch3_formats_attr,
@@ -1840,8 +1909,9 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
- x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
+ x86_add_quirk(intel_sandybridge_quirk);
+ case 58: /* IvyBridge */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1849,6 +1919,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5a3edc27f6e5..35e2192df9f4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f56f96da77f5..fd019d78b1f4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index f61ee67ec00f..677b1ed184c9 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <asm/word-at-a-time.h>
+#include <linux/sched.h>
/*
* best effort, GUP based copy_from_user() that is NMI-safe
@@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
void *map;
int ret;
+ if (__range_not_ok(from, n, TASK_SIZE) == 0)
+ return len;
+
do {
ret = __get_user_pages_fast(addr, 1, 0, &page);
if (!ret)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 819137904428..5d7e51f3fd28 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -28,7 +28,7 @@
# - (66): the last prefix is 0x66
# - (F3): the last prefix is 0xF3
# - (F2): the last prefix is 0xF2
-#
+# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
Table: one byte opcode
Referrer:
@@ -515,12 +515,12 @@ b4: LFS Gv,Mp
b5: LGS Gv,Mp
b6: MOVZX Gv,Eb
b7: MOVZX Gv,Ew
-b8: JMPE | POPCNT Gv,Ev (F3)
+b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
b9: Grp10 (1A)
ba: Grp8 Ev,Ib (1A)
bb: BTC Ev,Gv
-bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
-bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
+bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
be: MOVSX Gv,Eb
bf: MOVSX Gv,Ew
# 0x0f 0xc0-0xcf
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index 5f6a5b6c3a15..ddcf39b1a18d 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -66,9 +66,10 @@ BEGIN {
rex_expr = "^REX(\\.[XRWB]+)*"
fpu_expr = "^ESC" # TODO
- lprefix1_expr = "\\(66\\)"
+ lprefix1_expr = "\\((66|!F3)\\)"
lprefix2_expr = "\\(F3\\)"
- lprefix3_expr = "\\(F2\\)"
+ lprefix3_expr = "\\((F2|!F3)\\)"
+ lprefix_expr = "\\((66|F2|F3)\\)"
max_lprefix = 4
# All opcodes starting with lower-case 'v' or with (v1) superscript
@@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod)
if (match(ext, lprefix1_expr)) {
lptable1[idx] = add_flags(lptable1[idx],flags)
variant = "INAT_VARIANT"
- } else if (match(ext, lprefix2_expr)) {
+ }
+ if (match(ext, lprefix2_expr)) {
lptable2[idx] = add_flags(lptable2[idx],flags)
variant = "INAT_VARIANT"
- } else if (match(ext, lprefix3_expr)) {
+ }
+ if (match(ext, lprefix3_expr)) {
lptable3[idx] = add_flags(lptable3[idx],flags)
variant = "INAT_VARIANT"
- } else {
+ }
+ if (!match(ext, lprefix_expr)){
table[idx] = add_flags(table[idx],flags)
}
}
diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h
index 0b9f2e13c781..c1dacca312f3 100644
--- a/arch/xtensa/include/asm/syscall.h
+++ b/arch/xtensa/include/asm/syscall.h
@@ -31,5 +31,5 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec __user *tsp, const sigset_t __user *sigmask,
size_t sigsetsize);
-
-
+asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset,
+ size_t sigsetsize);
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index b9f8e5850d3a..efe4e854b3cd 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs)
if (ret)
return;
- signal_delivered(signr, info, ka, regs, 0);
+ signal_delivered(signr, &info, &ka, regs, 0);
if (current->ptrace & PT_SINGLESTEP)
task_pt_regs(current)->icountlevel = 1;
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 8d81a1d32653..dd3e661a124d 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC) += cs5535-clockevt.o
obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o
obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o
obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o
+obj-$(CONFIG_EM_TIMER_STI) += em_sti.o
obj-$(CONFIG_CLKBLD_I8253) += i8253.o
obj-$(CONFIG_CLKSRC_MMIO) += mmio.o
obj-$(CONFIG_DW_APB_TIMER) += dw_apb_timer.o
diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c
new file mode 100644
index 000000000000..372051d1bba8
--- /dev/null
+++ b/drivers/clocksource/em_sti.c
@@ -0,0 +1,406 @@
+/*
+ * Emma Mobile Timer Support - STI
+ *
+ * Copyright (C) 2012 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+enum { USER_CLOCKSOURCE, USER_CLOCKEVENT, USER_NR };
+
+struct em_sti_priv {
+ void __iomem *base;
+ struct clk *clk;
+ struct platform_device *pdev;
+ unsigned int active[USER_NR];
+ unsigned long rate;
+ raw_spinlock_t lock;
+ struct clock_event_device ced;
+ struct clocksource cs;
+};
+
+#define STI_CONTROL 0x00
+#define STI_COMPA_H 0x10
+#define STI_COMPA_L 0x14
+#define STI_COMPB_H 0x18
+#define STI_COMPB_L 0x1c
+#define STI_COUNT_H 0x20
+#define STI_COUNT_L 0x24
+#define STI_COUNT_RAW_H 0x28
+#define STI_COUNT_RAW_L 0x2c
+#define STI_SET_H 0x30
+#define STI_SET_L 0x34
+#define STI_INTSTATUS 0x40
+#define STI_INTRAWSTATUS 0x44
+#define STI_INTENSET 0x48
+#define STI_INTENCLR 0x4c
+#define STI_INTFFCLR 0x50
+
+static inline unsigned long em_sti_read(struct em_sti_priv *p, int offs)
+{
+ return ioread32(p->base + offs);
+}
+
+static inline void em_sti_write(struct em_sti_priv *p, int offs,
+ unsigned long value)
+{
+ iowrite32(value, p->base + offs);
+}
+
+static int em_sti_enable(struct em_sti_priv *p)
+{
+ int ret;
+
+ /* enable clock */
+ ret = clk_enable(p->clk);
+ if (ret) {
+ dev_err(&p->pdev->dev, "cannot enable clock\n");
+ return ret;
+ }
+
+ /* configure channel, periodic mode and maximum timeout */
+ p->rate = clk_get_rate(p->clk);
+
+ /* reset the counter */
+ em_sti_write(p, STI_SET_H, 0x40000000);
+ em_sti_write(p, STI_SET_L, 0x00000000);
+
+ /* mask and clear pending interrupts */
+ em_sti_write(p, STI_INTENCLR, 3);
+ em_sti_write(p, STI_INTFFCLR, 3);
+
+ /* enable updates of counter registers */
+ em_sti_write(p, STI_CONTROL, 1);
+
+ return 0;
+}
+
+static void em_sti_disable(struct em_sti_priv *p)
+{
+ /* mask interrupts */
+ em_sti_write(p, STI_INTENCLR, 3);
+
+ /* stop clock */
+ clk_disable(p->clk);
+}
+
+static cycle_t em_sti_count(struct em_sti_priv *p)
+{
+ cycle_t ticks;
+ unsigned long flags;
+
+ /* the STI hardware buffers the 48-bit count, but to
+ * break it out into two 32-bit access the registers
+ * must be accessed in a certain order.
+ * Always read STI_COUNT_H before STI_COUNT_L.
+ */
+ raw_spin_lock_irqsave(&p->lock, flags);
+ ticks = (cycle_t)(em_sti_read(p, STI_COUNT_H) & 0xffff) << 32;
+ ticks |= em_sti_read(p, STI_COUNT_L);
+ raw_spin_unlock_irqrestore(&p->lock, flags);
+
+ return ticks;
+}
+
+static cycle_t em_sti_set_next(struct em_sti_priv *p, cycle_t next)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&p->lock, flags);
+
+ /* mask compare A interrupt */
+ em_sti_write(p, STI_INTENCLR, 1);
+
+ /* update compare A value */
+ em_sti_write(p, STI_COMPA_H, next >> 32);
+ em_sti_write(p, STI_COMPA_L, next & 0xffffffff);
+
+ /* clear compare A interrupt source */
+ em_sti_write(p, STI_INTFFCLR, 1);
+
+ /* unmask compare A interrupt */
+ em_sti_write(p, STI_INTENSET, 1);
+
+ raw_spin_unlock_irqrestore(&p->lock, flags);
+
+ return next;
+}
+
+static irqreturn_t em_sti_interrupt(int irq, void *dev_id)
+{
+ struct em_sti_priv *p = dev_id;
+
+ p->ced.event_handler(&p->ced);
+ return IRQ_HANDLED;
+}
+
+static int em_sti_start(struct em_sti_priv *p, unsigned int user)
+{
+ unsigned long flags;
+ int used_before;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&p->lock, flags);
+ used_before = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT];
+ if (!used_before)
+ ret = em_sti_enable(p);
+
+ if (!ret)
+ p->active[user] = 1;
+ raw_spin_unlock_irqrestore(&p->lock, flags);
+
+ return ret;
+}
+
+static void em_sti_stop(struct em_sti_priv *p, unsigned int user)
+{
+ unsigned long flags;
+ int used_before, used_after;
+
+ raw_spin_lock_irqsave(&p->lock, flags);
+ used_before = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT];
+ p->active[user] = 0;
+ used_after = p->active[USER_CLOCKSOURCE] | p->active[USER_CLOCKEVENT];
+
+ if (used_before && !used_after)
+ em_sti_disable(p);
+ raw_spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static struct em_sti_priv *cs_to_em_sti(struct clocksource *cs)
+{
+ return container_of(cs, struct em_sti_priv, cs);
+}
+
+static cycle_t em_sti_clocksource_read(struct clocksource *cs)
+{
+ return em_sti_count(cs_to_em_sti(cs));
+}
+
+static int em_sti_clocksource_enable(struct clocksource *cs)
+{
+ int ret;
+ struct em_sti_priv *p = cs_to_em_sti(cs);
+
+ ret = em_sti_start(p, USER_CLOCKSOURCE);
+ if (!ret)
+ __clocksource_updatefreq_hz(cs, p->rate);
+ return ret;
+}
+
+static void em_sti_clocksource_disable(struct clocksource *cs)
+{
+ em_sti_stop(cs_to_em_sti(cs), USER_CLOCKSOURCE);
+}
+
+static void em_sti_clocksource_resume(struct clocksource *cs)
+{
+ em_sti_clocksource_enable(cs);
+}
+
+static int em_sti_register_clocksource(struct em_sti_priv *p)
+{
+ struct clocksource *cs = &p->cs;
+
+ memset(cs, 0, sizeof(*cs));
+ cs->name = dev_name(&p->pdev->dev);
+ cs->rating = 200;
+ cs->read = em_sti_clocksource_read;
+ cs->enable = em_sti_clocksource_enable;
+ cs->disable = em_sti_clocksource_disable;
+ cs->suspend = em_sti_clocksource_disable;
+ cs->resume = em_sti_clocksource_resume;
+ cs->mask = CLOCKSOURCE_MASK(48);
+ cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
+
+ dev_info(&p->pdev->dev, "used as clock source\n");
+
+ /* Register with dummy 1 Hz value, gets updated in ->enable() */
+ clocksource_register_hz(cs, 1);
+ return 0;
+}
+
+static struct em_sti_priv *ced_to_em_sti(struct clock_event_device *ced)
+{
+ return container_of(ced, struct em_sti_priv, ced);
+}
+
+static void em_sti_clock_event_mode(enum clock_event_mode mode,
+ struct clock_event_device *ced)
+{
+ struct em_sti_priv *p = ced_to_em_sti(ced);
+
+ /* deal with old setting first */
+ switch (ced->mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ em_sti_stop(p, USER_CLOCKEVENT);
+ break;
+ default:
+ break;
+ }
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ dev_info(&p->pdev->dev, "used for oneshot clock events\n");
+ em_sti_start(p, USER_CLOCKEVENT);
+ clockevents_config(&p->ced, p->rate);
+ break;
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ case CLOCK_EVT_MODE_UNUSED:
+ em_sti_stop(p, USER_CLOCKEVENT);
+ break;
+ default:
+ break;
+ }
+}
+
+static int em_sti_clock_event_next(unsigned long delta,
+ struct clock_event_device *ced)
+{
+ struct em_sti_priv *p = ced_to_em_sti(ced);
+ cycle_t next;
+ int safe;
+
+ next = em_sti_set_next(p, em_sti_count(p) + delta);
+ safe = em_sti_count(p) < (next - 1);
+
+ return !safe;
+}
+
+static void em_sti_register_clockevent(struct em_sti_priv *p)
+{
+ struct clock_event_device *ced = &p->ced;
+
+ memset(ced, 0, sizeof(*ced));
+ ced->name = dev_name(&p->pdev->dev);
+ ced->features = CLOCK_EVT_FEAT_ONESHOT;
+ ced->rating = 200;
+ ced->cpumask = cpumask_of(0);
+ ced->set_next_event = em_sti_clock_event_next;
+ ced->set_mode = em_sti_clock_event_mode;
+
+ dev_info(&p->pdev->dev, "used for clock events\n");
+
+ /* Register with dummy 1 Hz value, gets updated in ->set_mode() */
+ clockevents_config_and_register(ced, 1, 2, 0xffffffff);
+}
+
+static int __devinit em_sti_probe(struct platform_device *pdev)
+{
+ struct em_sti_priv *p;
+ struct resource *res;
+ int irq, ret;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (p == NULL) {
+ dev_err(&pdev->dev, "failed to allocate driver data\n");
+ ret = -ENOMEM;
+ goto err0;
+ }
+
+ p->pdev = pdev;
+ platform_set_drvdata(pdev, p);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "failed to get I/O memory\n");
+ ret = -EINVAL;
+ goto err0;
+ }
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(&pdev->dev, "failed to get irq\n");
+ ret = -EINVAL;
+ goto err0;
+ }
+
+ /* map memory, let base point to the STI instance */
+ p->base = ioremap_nocache(res->start, resource_size(res));
+ if (p->base == NULL) {
+ dev_err(&pdev->dev, "failed to remap I/O memory\n");
+ ret = -ENXIO;
+ goto err0;
+ }
+
+ /* get hold of clock */
+ p->clk = clk_get(&pdev->dev, "sclk");
+ if (IS_ERR(p->clk)) {
+ dev_err(&pdev->dev, "cannot get clock\n");
+ ret = PTR_ERR(p->clk);
+ goto err1;
+ }
+
+ if (request_irq(irq, em_sti_interrupt,
+ IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
+ dev_name(&pdev->dev), p)) {
+ dev_err(&pdev->dev, "failed to request low IRQ\n");
+ ret = -ENOENT;
+ goto err2;
+ }
+
+ raw_spin_lock_init(&p->lock);
+ em_sti_register_clockevent(p);
+ em_sti_register_clocksource(p);
+ return 0;
+
+err2:
+ clk_put(p->clk);
+err1:
+ iounmap(p->base);
+err0:
+ kfree(p);
+ return ret;
+}
+
+static int __devexit em_sti_remove(struct platform_device *pdev)
+{
+ return -EBUSY; /* cannot unregister clockevent and clocksource */
+}
+
+static const struct of_device_id em_sti_dt_ids[] __devinitconst = {
+ { .compatible = "renesas,em-sti", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, em_sti_dt_ids);
+
+static struct platform_driver em_sti_device_driver = {
+ .probe = em_sti_probe,
+ .remove = __devexit_p(em_sti_remove),
+ .driver = {
+ .name = "em_sti",
+ .of_match_table = em_sti_dt_ids,
+ }
+};
+
+module_platform_driver(em_sti_device_driver);
+
+MODULE_AUTHOR("Magnus Damm");
+MODULE_DESCRIPTION("Renesas Emma Mobile STI Timer Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
index 4e7ef0e6b79c..d46764b5aaba 100644
--- a/drivers/staging/ramster/zcache-main.c
+++ b/drivers/staging/ramster/zcache-main.c
@@ -3002,7 +3002,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
return oid;
}
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_store(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -3025,7 +3025,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
/* returns 0 if the page was successfully gotten from frontswap, -1 if
* was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_load(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -3080,8 +3080,8 @@ static void zcache_frontswap_init(unsigned ignored)
}
static struct frontswap_ops zcache_frontswap_ops = {
- .put_page = zcache_frontswap_put_page,
- .get_page = zcache_frontswap_get_page,
+ .store = zcache_frontswap_store,
+ .load = zcache_frontswap_load,
.invalidate_page = zcache_frontswap_flush_page,
.invalidate_area = zcache_frontswap_flush_area,
.init = zcache_frontswap_init
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 2734dacacbaf..784c796b9848 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -1835,7 +1835,7 @@ static int zcache_frontswap_poolid = -1;
* Swizzling increases objects per swaptype, increasing tmem concurrency
* for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
* Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
- * frontswap_get_page(), but has side-effects. Hence using 8.
+ * frontswap_load(), but has side-effects. Hence using 8.
*/
#define SWIZ_BITS 8
#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
@@ -1849,7 +1849,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
return oid;
}
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_store(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -1870,7 +1870,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
/* returns 0 if the page was successfully gotten from frontswap, -1 if
* was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_load(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -1919,8 +1919,8 @@ static void zcache_frontswap_init(unsigned ignored)
}
static struct frontswap_ops zcache_frontswap_ops = {
- .put_page = zcache_frontswap_put_page,
- .get_page = zcache_frontswap_get_page,
+ .store = zcache_frontswap_store,
+ .load = zcache_frontswap_load,
.invalidate_page = zcache_frontswap_flush_page,
.invalidate_area = zcache_frontswap_flush_area,
.init = zcache_frontswap_init
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index dcb79521e6c8..89f264c67420 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -269,7 +269,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
}
/* returns 0 if the page was successfully put into frontswap, -1 if not */
-static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
+static int tmem_frontswap_store(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -295,7 +295,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
* returns 0 if the page was successfully gotten from frontswap, -1 if
* was not present (should never happen!)
*/
-static int tmem_frontswap_get_page(unsigned type, pgoff_t offset,
+static int tmem_frontswap_load(unsigned type, pgoff_t offset,
struct page *page)
{
u64 ind64 = (u64)offset;
@@ -362,8 +362,8 @@ static int __init no_frontswap(char *s)
__setup("nofrontswap", no_frontswap);
static struct frontswap_ops __initdata tmem_frontswap_ops = {
- .put_page = tmem_frontswap_put_page,
- .get_page = tmem_frontswap_get_page,
+ .store = tmem_frontswap_store,
+ .load = tmem_frontswap_load,
.invalidate_page = tmem_frontswap_flush_page,
.invalidate_area = tmem_frontswap_flush_area,
.init = tmem_frontswap_init
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 20350a93ed99..6df0cbe1cbc9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -174,6 +174,7 @@ struct smb_version_operations {
void (*add_credits)(struct TCP_Server_Info *, const unsigned int);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *);
+ __u64 (*get_next_mid)(struct TCP_Server_Info *);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
/* data length from read response message */
@@ -399,6 +400,12 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
+static inline __u64
+get_next_mid(struct TCP_Server_Info *server)
+{
+ return server->ops->get_next_mid(server);
+}
+
/*
* Macros to allow the TCP_Server_Info->net field and related code to drop out
* when CONFIG_NET_NS isn't set.
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5ec21ecf7980..0a6cbfe2761e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -114,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
void **request_buf);
extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
-extern __u64 GetNextMid(struct TCP_Server_Info *server);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b5ad716b2642..5b400730c213 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -268,7 +268,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
return rc;
buffer = (struct smb_hdr *)*request_buf;
- buffer->Mid = GetNextMid(ses->server);
+ buffer->Mid = get_next_mid(ses->server);
if (ses->capabilities & CAP_UNICODE)
buffer->Flags2 |= SMBFLG2_UNICODE;
if (ses->capabilities & CAP_STATUS32)
@@ -402,7 +402,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
cFYI(1, "secFlags 0x%x", secFlags);
- pSMB->hdr.Mid = GetNextMid(server);
+ pSMB->hdr.Mid = get_next_mid(server);
pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
@@ -782,7 +782,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
return rc;
}
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
if (ses->server->sec_mode &
(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
@@ -4762,7 +4762,7 @@ getDFSRetry:
/* server pointer checked in called function,
but should never be null here anyway */
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
pSMB->hdr.Tid = ses->ipc_tid;
pSMB->hdr.Uid = ses->Suid;
if (ses->capabilities & CAP_STATUS32)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ccafdedd0dbc..78db68a5cf44 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1058,13 +1058,15 @@ cifs_demultiplex_thread(void *p)
if (mid_entry != NULL) {
if (!mid_entry->multiRsp || mid_entry->multiEnd)
mid_entry->callback(mid_entry);
- } else if (!server->ops->is_oplock_break(buf, server)) {
+ } else if (!server->ops->is_oplock_break ||
+ !server->ops->is_oplock_break(buf, server)) {
cERROR(1, "No task to wake, unknown frame received! "
"NumMids %d", atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", buf,
HEADER_SIZE(server));
#ifdef CONFIG_CIFS_DEBUG2
- server->ops->dump_detail(buf);
+ if (server->ops->dump_detail)
+ server->ops->dump_detail(buf);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
@@ -3938,7 +3940,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
NULL /*no tid */ , 4 /*wct */ );
- smb_buffer->Mid = GetNextMid(ses->server);
+ smb_buffer->Mid = get_next_mid(ses->server);
smb_buffer->Uid = ses->Suid;
pSMB = (TCONX_REQ *) smb_buffer;
pSMBr = (TCONX_RSP *) smb_buffer_response;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 253170dfa716..513adbc211d7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -876,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
struct cifsLockInfo *li, *tmp;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
- unsigned int num, max_num;
+ unsigned int num, max_num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
int types[] = {LOCKING_ANDX_LARGE_FILES,
LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
@@ -892,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
return rc;
}
- max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
- sizeof(LOCKING_ANDX_RANGE);
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return -EINVAL;
+ }
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
mutex_unlock(&cinode->lock_mutex);
@@ -1218,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
int types[] = {LOCKING_ANDX_LARGE_FILES,
LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
unsigned int i;
- unsigned int max_num, num;
+ unsigned int max_num, num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
@@ -1228,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
INIT_LIST_HEAD(&tmp_llist);
- max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
- sizeof(LOCKING_ANDX_RANGE);
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf)
+ return -EINVAL;
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1247,46 +1266,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
continue;
if (types[i] != li->type)
continue;
- if (!cinode->can_cache_brlcks) {
- cur->Pid = cpu_to_le16(li->pid);
- cur->LengthLow = cpu_to_le32((u32)li->length);
- cur->LengthHigh =
- cpu_to_le32((u32)(li->length>>32));
- cur->OffsetLow = cpu_to_le32((u32)li->offset);
- cur->OffsetHigh =
- cpu_to_le32((u32)(li->offset>>32));
- /*
- * We need to save a lock here to let us add
- * it again to the file's list if the unlock
- * range request fails on the server.
- */
- list_move(&li->llist, &tmp_llist);
- if (++num == max_num) {
- stored_rc = cifs_lockv(xid, tcon,
- cfile->netfid,
- li->type, num,
- 0, buf);
- if (stored_rc) {
- /*
- * We failed on the unlock range
- * request - add all locks from
- * the tmp list to the head of
- * the file's list.
- */
- cifs_move_llist(&tmp_llist,
- &cfile->llist);
- rc = stored_rc;
- } else
- /*
- * The unlock range request
- * succeed - free the tmp list.
- */
- cifs_free_llist(&tmp_llist);
- cur = buf;
- num = 0;
- } else
- cur++;
- } else {
+ if (cinode->can_cache_brlcks) {
/*
* We can cache brlock requests - simply remove
* a lock from the file's list.
@@ -1294,7 +1274,41 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
list_del(&li->llist);
cifs_del_lock_waiters(li);
kfree(li);
+ continue;
}
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ /*
+ * We need to save a lock here to let us add it again to
+ * the file's list if the unlock range request fails on
+ * the server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ li->type, num, 0, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range
+ * request - add all locks from the tmp
+ * list to the head of the file's list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cfile->llist);
+ rc = stored_rc;
+ } else
+ /*
+ * The unlock range request succeed -
+ * free the tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
}
if (num) {
stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e2552d2b2e42..557506ae1e2a 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free)
return;
}
-/*
- * Find a free multiplex id (SMB mid). Otherwise there could be
- * mid collisions which might cause problems, demultiplexing the
- * wrong response to this request. Multiplex ids could collide if
- * one of a series requests takes much longer than the others, or
- * if a very large number of long lived requests (byte range
- * locks or FindNotify requests) are pending. No more than
- * 64K-1 requests can be outstanding at one time. If no
- * mids are available, return zero. A future optimization
- * could make the combination of mids and uid the key we use
- * to demultiplex on (rather than mid alone).
- * In addition to the above check, the cifs demultiplex
- * code already used the command code as a secondary
- * check of the frame and if signing is negotiated the
- * response would be discarded if the mid were the same
- * but the signature was wrong. Since the mid is not put in the
- * pending queue until later (when it is about to be dispatched)
- * we do have to limit the number of outstanding requests
- * to somewhat less than 64K-1 although it is hard to imagine
- * so many threads being in the vfs at one time.
- */
-__u64 GetNextMid(struct TCP_Server_Info *server)
-{
- __u64 mid = 0;
- __u16 last_mid, cur_mid;
- bool collision;
-
- spin_lock(&GlobalMid_Lock);
-
- /* mid is 16 bit only for CIFS/SMB */
- cur_mid = (__u16)((server->CurrentMid) & 0xffff);
- /* we do not want to loop forever */
- last_mid = cur_mid;
- cur_mid++;
-
- /*
- * This nested loop looks more expensive than it is.
- * In practice the list of pending requests is short,
- * fewer than 50, and the mids are likely to be unique
- * on the first pass through the loop unless some request
- * takes longer than the 64 thousand requests before it
- * (and it would also have to have been a request that
- * did not time out).
- */
- while (cur_mid != last_mid) {
- struct mid_q_entry *mid_entry;
- unsigned int num_mids;
-
- collision = false;
- if (cur_mid == 0)
- cur_mid++;
-
- num_mids = 0;
- list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
- ++num_mids;
- if (mid_entry->mid == cur_mid &&
- mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
- /* This mid is in use, try a different one */
- collision = true;
- break;
- }
- }
-
- /*
- * if we have more than 32k mids in the list, then something
- * is very wrong. Possibly a local user is trying to DoS the
- * box by issuing long-running calls and SIGKILL'ing them. If
- * we get to 2^16 mids then we're in big trouble as this
- * function could loop forever.
- *
- * Go ahead and assign out the mid in this situation, but force
- * an eventual reconnect to clean out the pending_mid_q.
- */
- if (num_mids > 32768)
- server->tcpStatus = CifsNeedReconnect;
-
- if (!collision) {
- mid = (__u64)cur_mid;
- server->CurrentMid = mid;
- break;
- }
- cur_mid++;
- }
- spin_unlock(&GlobalMid_Lock);
- return mid;
-}
-
/* NB: MID can not be set if treeCon not passed in, in that
case it is responsbility of caller to set the mid */
void
@@ -334,7 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
/* Uid is not converted */
buffer->Uid = treeCon->ses->Suid;
- buffer->Mid = GetNextMid(treeCon->ses->server);
+ buffer->Mid = get_next_mid(treeCon->ses->server);
}
if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
buffer->Flags2 |= SMBFLG2_DFS;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d9d615fbed3f..6dec38f5522d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -125,6 +125,94 @@ cifs_get_credits_field(struct TCP_Server_Info *server)
return &server->credits;
}
+/*
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+static __u64
+cifs_get_next_mid(struct TCP_Server_Info *server)
+{
+ __u64 mid = 0;
+ __u16 last_mid, cur_mid;
+ bool collision;
+
+ spin_lock(&GlobalMid_Lock);
+
+ /* mid is 16 bit only for CIFS/SMB */
+ cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+ /* we do not want to loop forever */
+ last_mid = cur_mid;
+ cur_mid++;
+
+ /*
+ * This nested loop looks more expensive than it is.
+ * In practice the list of pending requests is short,
+ * fewer than 50, and the mids are likely to be unique
+ * on the first pass through the loop unless some request
+ * takes longer than the 64 thousand requests before it
+ * (and it would also have to have been a request that
+ * did not time out).
+ */
+ while (cur_mid != last_mid) {
+ struct mid_q_entry *mid_entry;
+ unsigned int num_mids;
+
+ collision = false;
+ if (cur_mid == 0)
+ cur_mid++;
+
+ num_mids = 0;
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+ ++num_mids;
+ if (mid_entry->mid == cur_mid &&
+ mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
+ /* This mid is in use, try a different one */
+ collision = true;
+ break;
+ }
+ }
+
+ /*
+ * if we have more than 32k mids in the list, then something
+ * is very wrong. Possibly a local user is trying to DoS the
+ * box by issuing long-running calls and SIGKILL'ing them. If
+ * we get to 2^16 mids then we're in big trouble as this
+ * function could loop forever.
+ *
+ * Go ahead and assign out the mid in this situation, but force
+ * an eventual reconnect to clean out the pending_mid_q.
+ */
+ if (num_mids > 32768)
+ server->tcpStatus = CifsNeedReconnect;
+
+ if (!collision) {
+ mid = (__u64)cur_mid;
+ server->CurrentMid = mid;
+ break;
+ }
+ cur_mid++;
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -133,6 +221,7 @@ struct smb_version_operations smb1_operations = {
.add_credits = cifs_add_credits,
.set_credits = cifs_set_credits,
.get_credits_field = cifs_get_credits_field,
+ .get_next_mid = cifs_get_next_mid,
.read_data_offset = cifs_read_data_offset,
.read_data_length = cifs_read_data_length,
.map_error = map_smb_to_linux_error,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1b36ffe6a47b..3097ee58fd7d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -779,7 +779,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
pSMB->Timeout = 0;
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
return SendReceive(xid, ses, in_buf, out_buf,
&bytes_returned, 0);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 42593c587d48..03ff5b1eba93 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -75,19 +75,13 @@ static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
unsigned global_limit)
{
unsigned long t;
- char tmp[32];
unsigned limit = (1 << 16) - 1;
int err;
- if (*ppos || count >= sizeof(tmp) - 1)
- return -EINVAL;
-
- if (copy_from_user(tmp, buf, count))
+ if (*ppos)
return -EINVAL;
- tmp[count] = '\0';
-
- err = strict_strtoul(tmp, 0, &t);
+ err = kstrtoul_from_user(buf, count, 0, &t);
if (err)
return err;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index df5ac048dc74..334e0b18a014 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -775,6 +775,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
struct kstat *stat)
{
+ unsigned int blkbits;
+
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -790,7 +792,13 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
stat->ctime.tv_nsec = attr->ctimensec;
stat->size = attr->size;
stat->blocks = attr->blocks;
- stat->blksize = (1 << inode->i_blkbits);
+
+ if (attr->blksize != 0)
+ blkbits = ilog2(attr->blksize);
+ else
+ blkbits = inode->i_sb->s_blocksize_bits;
+
+ stat->blksize = 1 << blkbits;
}
static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
@@ -863,6 +871,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
if (stat) {
generic_fillattr(inode, stat);
stat->mode = fi->orig_i_mode;
+ stat->ino = fi->orig_ino;
}
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9562109d3a87..b321a688cde7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2173,6 +2173,44 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
return ret;
}
+long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t length)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_req *req;
+ struct fuse_fallocate_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .length = length,
+ .mode = mode
+ };
+ int err;
+
+ if (fc->no_fallocate)
+ return -EOPNOTSUPP;
+
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->in.h.opcode = FUSE_FALLOCATE;
+ req->in.h.nodeid = ff->nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ if (err == -ENOSYS) {
+ fc->no_fallocate = 1;
+ err = -EOPNOTSUPP;
+ }
+ fuse_put_request(fc, req);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(fuse_file_fallocate);
+
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read = do_sync_read,
@@ -2190,6 +2228,7 @@ static const struct file_operations fuse_file_operations = {
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
+ .fallocate = fuse_file_fallocate,
};
static const struct file_operations fuse_direct_io_file_operations = {
@@ -2206,6 +2245,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
+ .fallocate = fuse_file_fallocate,
/* no splice_read */
};
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 572cefc78012..771fb6322c07 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -82,6 +82,9 @@ struct fuse_inode {
preserve the original mode */
umode_t orig_i_mode;
+ /** 64 bit inode number */
+ u64 orig_ino;
+
/** Version of last attribute change */
u64 attr_version;
@@ -478,6 +481,9 @@ struct fuse_conn {
/** Are BSD file locking primitives not implemented by fs? */
unsigned no_flock:1;
+ /** Is fallocate not implemented by fs? */
+ unsigned no_fallocate:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 42678a33b7bb..1cd61652018c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -91,6 +91,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->nlookup = 0;
fi->attr_version = 0;
fi->writectr = 0;
+ fi->orig_ino = 0;
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
INIT_LIST_HEAD(&fi->writepages);
@@ -139,6 +140,18 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
return 0;
}
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static ino_t fuse_squash_ino(u64 ino64)
+{
+ ino_t ino = (ino_t) ino64;
+ if (sizeof(ino_t) < sizeof(u64))
+ ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8;
+ return ino;
+}
+
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
u64 attr_valid)
{
@@ -148,7 +161,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
fi->attr_version = ++fc->attr_version;
fi->i_time = attr_valid;
- inode->i_ino = attr->ino;
+ inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
set_nlink(inode, attr->nlink);
inode->i_uid = attr->uid;
@@ -174,6 +187,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
fi->orig_i_mode = inode->i_mode;
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
inode->i_mode &= ~S_ISVTX;
+
+ fi->orig_ino = attr->ino;
}
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 616f41a7cde6..437195f204e1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1803,7 +1803,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
- unsigned i_mode, f_mode = file->f_mode;
+ unsigned f_mode = file->f_mode;
rcu_read_unlock();
put_files_struct(files);
@@ -1819,12 +1819,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = GLOBAL_ROOT_GID;
}
- i_mode = S_IFLNK;
- if (f_mode & FMODE_READ)
- i_mode |= S_IRUSR | S_IXUSR;
- if (f_mode & FMODE_WRITE)
- i_mode |= S_IWUSR | S_IXUSR;
- inode->i_mode = i_mode;
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
security_task_to_inode(task, inode);
put_task_struct(task);
@@ -1859,6 +1861,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
ei = PROC_I(inode);
ei->fd = fd;
+ inode->i_mode = S_IFLNK;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 81e803e90aa4..acba894374a1 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -132,6 +132,7 @@ extern u64 clockevent_delta2ns(unsigned long latch,
struct clock_event_device *evt);
extern void clockevents_register_device(struct clock_event_device *dev);
+extern void clockevents_config(struct clock_event_device *dev, u32 freq);
extern void clockevents_config_and_register(struct clock_event_device *dev,
u32 freq, unsigned long min_delta,
unsigned long max_delta);
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
new file mode 100644
index 000000000000..0e4e2eec5c1d
--- /dev/null
+++ b/include/linux/frontswap.h
@@ -0,0 +1,127 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/bitops.h>
+
+struct frontswap_ops {
+ void (*init)(unsigned);
+ int (*store)(unsigned, pgoff_t, struct page *);
+ int (*load)(unsigned, pgoff_t, struct page *);
+ void (*invalidate_page)(unsigned, pgoff_t);
+ void (*invalidate_area)(unsigned);
+};
+
+extern bool frontswap_enabled;
+extern struct frontswap_ops
+ frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+extern void frontswap_writethrough(bool);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_store(struct page *page);
+extern int __frontswap_load(struct page *page);
+extern void __frontswap_invalidate_page(unsigned, pgoff_t);
+extern void __frontswap_invalidate_area(unsigned);
+
+#ifdef CONFIG_FRONTSWAP
+
+static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+ bool ret = false;
+
+ if (frontswap_enabled && sis->frontswap_map)
+ ret = test_bit(offset, sis->frontswap_map);
+ return ret;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ set_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ if (frontswap_enabled && sis->frontswap_map)
+ clear_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+ unsigned long *map)
+{
+ p->frontswap_map = map;
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+ return p->frontswap_map;
+}
+#else
+/* all inline routines become no-ops and all externs are ignored */
+
+#define frontswap_enabled (0)
+
+static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+ return false;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+ unsigned long *map)
+{
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+ return NULL;
+}
+#endif
+
+static inline int frontswap_store(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_store(page);
+ return ret;
+}
+
+static inline int frontswap_load(struct page *page)
+{
+ int ret = -1;
+
+ if (frontswap_enabled)
+ ret = __frontswap_load(page);
+ return ret;
+}
+
+static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ if (frontswap_enabled)
+ __frontswap_invalidate_page(type, offset);
+}
+
+static inline void frontswap_invalidate_area(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_invalidate_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+ if (frontswap_enabled)
+ __frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8f2ab8fef929..9303348965fb 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -54,6 +54,9 @@
* 7.18
* - add FUSE_IOCTL_DIR flag
* - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
*/
#ifndef _LINUX_FUSE_H
@@ -85,7 +88,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 18
+#define FUSE_KERNEL_MINOR_VERSION 19
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -278,6 +281,7 @@ enum fuse_opcode {
FUSE_POLL = 40,
FUSE_NOTIFY_REPLY = 41,
FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
/* CUSE specific operations */
CUSE_INIT = 4096,
@@ -571,6 +575,14 @@ struct fuse_notify_poll_wakeup_out {
__u64 kh;
};
+struct fuse_fallocate_in {
+ __u64 fh;
+ __u64 offset;
+ __u64 length;
+ __u32 mode;
+ __u32 padding;
+};
+
struct fuse_in_header {
__u32 len;
__u32 opcode;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e4baff5f7ff4..9e65eff6af3b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -149,6 +149,7 @@ extern struct cred init_cred;
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
+ .nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
.se = { \
@@ -157,7 +158,6 @@ extern struct cred init_cred;
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = RR_TIMESLICE, \
- .nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f32578634d9d..45db49f64bb4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -555,6 +555,8 @@ enum perf_event_type {
PERF_RECORD_MAX, /* non-ABI */
};
+#define PERF_MAX_STACK_DEPTH 127
+
enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
@@ -609,8 +611,6 @@ struct perf_guest_info_callbacks {
#include <linux/sysfs.h>
#include <asm/local.h>
-#define PERF_MAX_STACK_DEPTH 255
-
struct perf_callchain_entry {
__u64 nr;
__u64 ip[PERF_MAX_STACK_DEPTH];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f34437e835a7..6029d8c54476 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void);
extern void calc_global_load(unsigned long ticks);
+extern void update_cpu_load_nohz(void);
extern unsigned long get_parent_ip(unsigned long addr);
@@ -1187,7 +1188,6 @@ struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned int time_slice;
- int nr_cpus_allowed;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1252,6 +1252,7 @@ struct task_struct {
#endif
unsigned int policy;
+ int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b6661933e252..c84ec68eaec9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,10 @@ struct swap_info_struct {
struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */
+#ifdef CONFIG_FRONTSWAP
+ unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
+ atomic_t frontswap_pages; /* frontswap pages in-use counter */
+#endif
};
struct swap_list_t {
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
new file mode 100644
index 000000000000..e282624e8c10
--- /dev/null
+++ b/include/linux/swapfile.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_SWAPFILE_H
+#define _LINUX_SWAPFILE_H
+
+/*
+ * these were static in swapfile.c but frontswap.c needs them and we don't
+ * want to expose them to the dozens of source files that include swap.h
+ */
+extern spinlock_t swap_lock;
+extern struct swap_list_t swap_list;
+extern struct swap_info_struct *swap_info[];
+extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+#endif /* _LINUX_SWAPFILE_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..f85c0154b333 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- func(event);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
perf_event_for_each_child(sibling, func);
mutex_unlock(&ctx->mutex);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+extern int irq_do_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force);
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
#endif
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ int ret;
+
+ ret = chip->irq_set_affinity(data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return -EINVAL;
if (irq_can_move_pcntxt(data)) {
- ret = chip->irq_set_affinity(data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(data->affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- ret = 0;
- }
+ ret = irq_do_set_affinity(data, mask, false);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
static int
setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
- struct irq_chip *chip = irq_desc_get_chip(desc);
struct cpumask *set = irq_default_affinity;
- int ret, node = desc->irq_data.node;
+ int node = desc->irq_data.node;
/* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
if (cpumask_intersects(mask, nodemask))
cpumask_and(mask, mask, nodemask);
}
- ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(desc->irq_data.affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- }
+ irq_do_set_affinity(&desc->irq_data, mask, false);
return 0;
}
#else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids)) {
- int ret = chip->irq_set_affinity(&desc->irq_data,
- desc->pending_mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- }
- }
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+ irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..c46958e26121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
/*
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
void update_idle_cpu_load(struct rq *this_rq)
{
- unsigned long curr_jiffies = jiffies;
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
unsigned long load = this_rq->load.weight;
unsigned long pending_updates;
/*
- * Bloody broken means of dealing with nohz, but better than nothing..
- * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
- * update and see 0 difference the one time and 2 the next, even though
- * we ticked at roughtly the same rate.
- *
- * Hence we only use this from nohz_idle_balance() and skip this
- * nonsense when called from the scheduler_tick() since that's
- * guaranteed a stable rate.
+ * bail if there's load or we're actually up-to-date.
*/
if (load || curr_jiffies == this_rq->last_load_update_tick)
return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
}
/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
* Called from scheduler_tick()
*/
static void update_cpu_load_active(struct rq *this_rq)
{
/*
- * See the mess in update_idle_cpu_load().
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
@@ -6403,7 +6438,7 @@ static void sched_init_numa(void)
return;
for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
if (!mask)
return;
@@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..b2a2d236f27b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..2a4e8dffbd6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
cpu = task_cpu(p);
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
goto out;
/* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->rt.nr_cpus_allowed < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio) &&
- (p->rt.nr_cpus_allowed > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1276,10 +1282,10 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->rt.nr_cpus_allowed == 1)
+ if (rq->curr->nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
(cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->rt.nr_cpus_allowed > 1))
+ (p->nr_cpus_allowed > 1))
return 1;
return 0;
}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->rt.nr_cpus_allowed == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1 &&
+ p->nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Only update if the process changes its state from whether it
* can migrate or not.
*/
- if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = RR_TIMESLICE;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are the
+ * only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
per_cpu(idle_threads, smp_processor_id()) = current;
}
+/**
+ * idle_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
static inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
}
/**
- * idle_thread_init - Initialize the idle thread for a cpu
- * @cpu: The cpu for which the idle thread should be initialized
- *
- * Creates the thread if it does not exist.
+ * idle_threads_init - Initialize idle threads for all cpus
*/
void __init idle_threads_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, boot_cpu;
+
+ boot_cpu = smp_processor_id();
for_each_possible_cpu(cpu) {
- if (cpu != smp_processor_id())
+ if (cpu != boot_cpu)
idle_init(cpu);
}
}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-static void clockevents_config(struct clock_event_device *dev,
- u32 freq)
+void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..da70c6db496c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void)
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
@@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
return HRTIMER_RESTART;
}
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+ get_option(&str, &sched_skew_tick);
+
+ return 0;
+}
+early_param("skew_tick", skew_tick);
+
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
@@ -831,6 +842,14 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+ /* Offset the tick to avert xtime_lock contention. */
+ if (sched_skew_tick) {
+ u64 offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= smp_processor_id();
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+ }
+
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer,
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ if (frontswap_enabled)
+ (*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implmentation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = (*frontswap_ops.store)(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else if (dup) {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+ inc_frontswap_failed_stores();
+ } else
+ inc_frontswap_failed_stores();
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = (*frontswap_ops.load)(type, offset, page);
+ if (ret == 0)
+ inc_frontswap_loads();
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ (*frontswap_ops.invalidate_page)(type, offset);
+ atomic_dec(&sis->frontswap_pages);
+ frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ (*frontswap_ops.invalidate_area)(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages = 0, total_pages_to_unuse;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+ bool locked = false;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ locked = true;
+ total_pages = 0;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ total_pages += atomic_read(&si->frontswap_pages);
+ }
+ if (total_pages <= target_pages)
+ goto out;
+ total_pages_to_unuse = total_pages - target_pages;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages)
+ pages = pages_to_unuse = total_pages_to_unuse;
+ else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages))
+ continue;
+ vm_unacct_memory(pages);
+ break;
+ }
+ if (type < 0)
+ goto out;
+ locked = false;
+ spin_unlock(&swap_lock);
+ try_to_unuse(type, true, pages_to_unuse);
+out:
+ if (locked)
+ spin_unlock(&swap_lock);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ spin_lock(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ spin_unlock(&swap_lock);
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..de5bc51c4a66 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
if ((p->flags & SWP_BLKDEV) &&
disk->fops->swap_slot_free_notify)
disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map)
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
{
int i, prev;
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
- err = try_to_unuse(type);
+ err = try_to_unuse(type, false, 0); /* force all pages to be unused */
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map);
+ enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
goto out_dput;
}
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ vfree(frontswap_map_get(p));
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1988,6 +2011,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2071,6 +2095,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled)
+ frontswap_map = vzalloc(maxpages / sizeof(long));
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2113,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "");
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 5476bc0a1eac..b4b572e8c100 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -1,4 +1,6 @@
tools/perf
+tools/scripts
+tools/lib/traceevent
include/linux/const.h
include/linux/perf_event.h
include/linux/rbtree.h
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8c767c6bca91..25249f76329d 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -152,7 +152,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
if (symbol_conf.use_callchain) {
err = callchain_append(he->callchain,
- &evsel->hists.callchain_cursor,
+ &callchain_cursor,
sample->period);
if (err)
return err;
@@ -162,7 +162,7 @@ static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
* so we don't allocated the extra space needed because the stdio
* code will not use it.
*/
- if (al->sym != NULL && use_browser > 0) {
+ if (he->ms.sym != NULL && use_browser > 0) {
struct annotation *notes = symbol__annotation(he->ms.sym);
assert(evsel != NULL);
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 62ae30d34fa6..262589991ea4 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1129,7 +1129,7 @@ static int add_default_attributes(void)
return 0;
if (!evsel_list->nr_entries) {
- if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0)
+ if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
return -1;
}
@@ -1139,21 +1139,21 @@ static int add_default_attributes(void)
return 0;
/* Append detailed run extra attributes: */
- if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0)
+ if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0)
return -1;
if (detailed_run < 2)
return 0;
/* Append very detailed run extra attributes: */
- if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0)
+ if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0)
return -1;
if (detailed_run < 3)
return 0;
/* Append very, very detailed run extra attributes: */
- return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs);
+ return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs);
}
int cmd_stat(int argc, const char **argv, const char *prefix __used)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 871b540293e1..6bb0277b7dfe 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -787,7 +787,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
}
if (symbol_conf.use_callchain) {
- err = callchain_append(he->callchain, &evsel->hists.callchain_cursor,
+ err = callchain_append(he->callchain, &callchain_cursor,
sample->period);
if (err)
return;
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index bd0bb1b1279b..67e5d0cace85 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -409,14 +409,15 @@ Counters can be enabled and disabled in two ways: via ioctl and via
prctl. When a counter is disabled, it doesn't count or generate
events but does continue to exist and maintain its count value.
-An individual counter or counter group can be enabled with
+An individual counter can be enabled with
- ioctl(fd, PERF_EVENT_IOC_ENABLE);
+ ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
or disabled with
- ioctl(fd, PERF_EVENT_IOC_DISABLE);
+ ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+For a counter group, pass PERF_IOC_FLAG_GROUP as the third argument.
Enabling or disabling the leader of a group enables or disables the
whole group; that is, while the group leader is disabled, none of the
counters in the group will count. Enabling or disabling a member of a
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 4deea6aaf927..34b1c46eaf42 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -668,7 +668,7 @@ static int annotate_browser__run(struct annotate_browser *browser, int evidx,
"q/ESC/CTRL+C Exit\n\n"
"-> Go to target\n"
"<- Exit\n"
- "h Cycle thru hottest instructions\n"
+ "H Cycle thru hottest instructions\n"
"j Toggle showing jump to target arrows\n"
"J Toggle showing number of jump sources on targets\n"
"n Search next string\n"
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
index ad73300f7bac..95264f304179 100755
--- a/tools/perf/util/PERF-VERSION-GEN
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -12,7 +12,7 @@ LF='
# First check if there is a .git to get the version from git describe
# otherwise try to get the version from the kernel makefile
if test -d ../../.git -o -f ../../.git &&
- VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+ VN=$(git describe --match 'v[0-9].[0-9]*' --abbrev=4 HEAD 2>/dev/null) &&
case "$VN" in
*$LF*) (exit 1) ;;
v[0-9]*)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 9f7106a8d9a4..3a6bff47614f 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -18,6 +18,8 @@
#include "util.h"
#include "callchain.h"
+__thread struct callchain_cursor callchain_cursor;
+
bool ip_callchain__valid(struct ip_callchain *chain,
const union perf_event *event)
{
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 7f9c0f1ae3a9..3bdb407f9cd9 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -76,6 +76,8 @@ struct callchain_cursor {
struct callchain_cursor_node *curr;
};
+extern __thread struct callchain_cursor callchain_cursor;
+
static inline void callchain_init(struct callchain_root *root)
{
INIT_LIST_HEAD(&root->node.siblings);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 4ac5f5ae4ce9..7400fb3fc50c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -159,6 +159,17 @@ out_delete_partial_list:
return -1;
}
+int __perf_evlist__add_default_attrs(struct perf_evlist *evlist,
+ struct perf_event_attr *attrs, size_t nr_attrs)
+{
+ size_t i;
+
+ for (i = 0; i < nr_attrs; i++)
+ event_attr_init(attrs + i);
+
+ return perf_evlist__add_attrs(evlist, attrs, nr_attrs);
+}
+
static int trace_event__id(const char *evname)
{
char *filename, *colon;
@@ -263,7 +274,8 @@ void perf_evlist__disable(struct perf_evlist *evlist)
for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
list_for_each_entry(pos, &evlist->entries, node) {
for (thread = 0; thread < evlist->threads->nr; thread++)
- ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_DISABLE);
+ ioctl(FD(pos, cpu, thread),
+ PERF_EVENT_IOC_DISABLE, 0);
}
}
}
@@ -276,7 +288,8 @@ void perf_evlist__enable(struct perf_evlist *evlist)
for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
list_for_each_entry(pos, &evlist->entries, node) {
for (thread = 0; thread < evlist->threads->nr; thread++)
- ioctl(FD(pos, cpu, thread), PERF_EVENT_IOC_ENABLE);
+ ioctl(FD(pos, cpu, thread),
+ PERF_EVENT_IOC_ENABLE, 0);
}
}
}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 58abb63ac13a..989bee9624c2 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -54,6 +54,8 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
int perf_evlist__add_default(struct perf_evlist *evlist);
int perf_evlist__add_attrs(struct perf_evlist *evlist,
struct perf_event_attr *attrs, size_t nr_attrs);
+int __perf_evlist__add_default_attrs(struct perf_evlist *evlist,
+ struct perf_event_attr *attrs, size_t nr_attrs);
int perf_evlist__add_tracepoints(struct perf_evlist *evlist,
const char *tracepoints[], size_t nr_tracepoints);
int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
@@ -62,6 +64,8 @@ int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
#define perf_evlist__add_attrs_array(evlist, array) \
perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array))
+#define perf_evlist__add_default_attrs(evlist, array) \
+ __perf_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array))
#define perf_evlist__add_tracepoints_array(evlist, array) \
perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array))
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 91d19138f3ec..9f6cebd798ee 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -494,16 +494,24 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
}
static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
- struct perf_sample *sample)
+ struct perf_sample *sample,
+ bool swapped)
{
const u64 *array = event->sample.array;
+ union u64_swap u;
array += ((event->header.size -
sizeof(event->header)) / sizeof(u64)) - 1;
if (type & PERF_SAMPLE_CPU) {
- u32 *p = (u32 *)array;
- sample->cpu = *p;
+ u.val64 = *array;
+ if (swapped) {
+ /* undo swap of u64, then swap on individual u32s */
+ u.val64 = bswap_64(u.val64);
+ u.val32[0] = bswap_32(u.val32[0]);
+ }
+
+ sample->cpu = u.val32[0];
array--;
}
@@ -523,9 +531,16 @@ static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
}
if (type & PERF_SAMPLE_TID) {
- u32 *p = (u32 *)array;
- sample->pid = p[0];
- sample->tid = p[1];
+ u.val64 = *array;
+ if (swapped) {
+ /* undo swap of u64, then swap on individual u32s */
+ u.val64 = bswap_64(u.val64);
+ u.val32[0] = bswap_32(u.val32[0]);
+ u.val32[1] = bswap_32(u.val32[1]);
+ }
+
+ sample->pid = u.val32[0];
+ sample->tid = u.val32[1];
}
return 0;
@@ -562,7 +577,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
if (event->header.type != PERF_RECORD_SAMPLE) {
if (!sample_id_all)
return 0;
- return perf_event__parse_id_sample(event, type, data);
+ return perf_event__parse_id_sample(event, type, data, swapped);
}
array = event->sample.array;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 1293b5ebea4d..514e2a4b367d 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -378,7 +378,7 @@ void hist_entry__free(struct hist_entry *he)
* collapse the histogram
*/
-static bool hists__collapse_insert_entry(struct hists *hists,
+static bool hists__collapse_insert_entry(struct hists *hists __used,
struct rb_root *root,
struct hist_entry *he)
{
@@ -397,8 +397,9 @@ static bool hists__collapse_insert_entry(struct hists *hists,
iter->period += he->period;
iter->nr_events += he->nr_events;
if (symbol_conf.use_callchain) {
- callchain_cursor_reset(&hists->callchain_cursor);
- callchain_merge(&hists->callchain_cursor, iter->callchain,
+ callchain_cursor_reset(&callchain_cursor);
+ callchain_merge(&callchain_cursor,
+ iter->callchain,
he->callchain);
}
hist_entry__free(he);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index cfc64e293f90..34bb556d6219 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -67,8 +67,6 @@ struct hists {
struct events_stats stats;
u64 event_stream;
u16 col_len[HISTC_NR_COLS];
- /* Best would be to reuse the session callchain cursor */
- struct callchain_cursor callchain_cursor;
};
struct hist_entry *__hists__add_entry(struct hists *self,
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c
index 1915de20dcac..3322b8446e89 100644
--- a/tools/perf/util/pager.c
+++ b/tools/perf/util/pager.c
@@ -57,6 +57,10 @@ void setup_pager(void)
}
if (!pager)
pager = getenv("PAGER");
+ if (!pager) {
+ if (!access("/usr/bin/pager", X_OK))
+ pager = "/usr/bin/pager";
+ }
if (!pager)
pager = "less";
else if (!*pager || !strcmp(pager, "cat"))
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 59dccc98b554..0dda25d82d06 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -2164,16 +2164,12 @@ int del_perf_probe_events(struct strlist *dellist)
error:
if (kfd >= 0) {
- if (namelist)
- strlist__delete(namelist);
-
+ strlist__delete(namelist);
close(kfd);
}
if (ufd >= 0) {
- if (unamelist)
- strlist__delete(unamelist);
-
+ strlist__delete(unamelist);
close(ufd);
}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 93d355d27109..2600916efa83 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -288,7 +288,8 @@ struct branch_info *machine__resolve_bstack(struct machine *self,
return bi;
}
-int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
+int machine__resolve_callchain(struct machine *self,
+ struct perf_evsel *evsel __used,
struct thread *thread,
struct ip_callchain *chain,
struct symbol **parent)
@@ -297,7 +298,12 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
unsigned int i;
int err;
- callchain_cursor_reset(&evsel->hists.callchain_cursor);
+ callchain_cursor_reset(&callchain_cursor);
+
+ if (chain->nr > PERF_MAX_STACK_DEPTH) {
+ pr_warning("corrupted callchain. skipping...\n");
+ return 0;
+ }
for (i = 0; i < chain->nr; i++) {
u64 ip;
@@ -317,7 +323,14 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
case PERF_CONTEXT_USER:
cpumode = PERF_RECORD_MISC_USER; break;
default:
- break;
+ pr_debug("invalid callchain context: "
+ "%"PRId64"\n", (s64) ip);
+ /*
+ * It seems the callchain is corrupted.
+ * Discard all.
+ */
+ callchain_cursor_reset(&callchain_cursor);
+ return 0;
}
continue;
}
@@ -333,7 +346,7 @@ int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
break;
}
- err = callchain_cursor_append(&evsel->hists.callchain_cursor,
+ err = callchain_cursor_append(&callchain_cursor,
ip, al.map, al.sym);
if (err)
return err;
@@ -441,37 +454,65 @@ void mem_bswap_64(void *src, int byte_size)
}
}
-static void perf_event__all64_swap(union perf_event *event)
+static void swap_sample_id_all(union perf_event *event, void *data)
+{
+ void *end = (void *) event + event->header.size;
+ int size = end - data;
+
+ BUG_ON(size % sizeof(u64));
+ mem_bswap_64(data, size);
+}
+
+static void perf_event__all64_swap(union perf_event *event,
+ bool sample_id_all __used)
{
struct perf_event_header *hdr = &event->header;
mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr));
}
-static void perf_event__comm_swap(union perf_event *event)
+static void perf_event__comm_swap(union perf_event *event, bool sample_id_all)
{
event->comm.pid = bswap_32(event->comm.pid);
event->comm.tid = bswap_32(event->comm.tid);
+
+ if (sample_id_all) {
+ void *data = &event->comm.comm;
+
+ data += ALIGN(strlen(data) + 1, sizeof(u64));
+ swap_sample_id_all(event, data);
+ }
}
-static void perf_event__mmap_swap(union perf_event *event)
+static void perf_event__mmap_swap(union perf_event *event,
+ bool sample_id_all)
{
event->mmap.pid = bswap_32(event->mmap.pid);
event->mmap.tid = bswap_32(event->mmap.tid);
event->mmap.start = bswap_64(event->mmap.start);
event->mmap.len = bswap_64(event->mmap.len);
event->mmap.pgoff = bswap_64(event->mmap.pgoff);
+
+ if (sample_id_all) {
+ void *data = &event->mmap.filename;
+
+ data += ALIGN(strlen(data) + 1, sizeof(u64));
+ swap_sample_id_all(event, data);
+ }
}
-static void perf_event__task_swap(union perf_event *event)
+static void perf_event__task_swap(union perf_event *event, bool sample_id_all)
{
event->fork.pid = bswap_32(event->fork.pid);
event->fork.tid = bswap_32(event->fork.tid);
event->fork.ppid = bswap_32(event->fork.ppid);
event->fork.ptid = bswap_32(event->fork.ptid);
event->fork.time = bswap_64(event->fork.time);
+
+ if (sample_id_all)
+ swap_sample_id_all(event, &event->fork + 1);
}
-static void perf_event__read_swap(union perf_event *event)
+static void perf_event__read_swap(union perf_event *event, bool sample_id_all)
{
event->read.pid = bswap_32(event->read.pid);
event->read.tid = bswap_32(event->read.tid);
@@ -479,6 +520,9 @@ static void perf_event__read_swap(union perf_event *event)
event->read.time_enabled = bswap_64(event->read.time_enabled);
event->read.time_running = bswap_64(event->read.time_running);
event->read.id = bswap_64(event->read.id);
+
+ if (sample_id_all)
+ swap_sample_id_all(event, &event->read + 1);
}
static u8 revbyte(u8 b)
@@ -530,7 +574,8 @@ void perf_event__attr_swap(struct perf_event_attr *attr)
swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64));
}
-static void perf_event__hdr_attr_swap(union perf_event *event)
+static void perf_event__hdr_attr_swap(union perf_event *event,
+ bool sample_id_all __used)
{
size_t size;
@@ -541,18 +586,21 @@ static void perf_event__hdr_attr_swap(union perf_event *event)
mem_bswap_64(event->attr.id, size);
}
-static void perf_event__event_type_swap(union perf_event *event)
+static void perf_event__event_type_swap(union perf_event *event,
+ bool sample_id_all __used)
{
event->event_type.event_type.event_id =
bswap_64(event->event_type.event_type.event_id);
}
-static void perf_event__tracing_data_swap(union perf_event *event)
+static void perf_event__tracing_data_swap(union perf_event *event,
+ bool sample_id_all __used)
{
event->tracing_data.size = bswap_32(event->tracing_data.size);
}
-typedef void (*perf_event__swap_op)(union perf_event *event);
+typedef void (*perf_event__swap_op)(union perf_event *event,
+ bool sample_id_all);
static perf_event__swap_op perf_event__swap_ops[] = {
[PERF_RECORD_MMAP] = perf_event__mmap_swap,
@@ -986,6 +1034,15 @@ static int perf_session__process_user_event(struct perf_session *session, union
}
}
+static void event_swap(union perf_event *event, bool sample_id_all)
+{
+ perf_event__swap_op swap;
+
+ swap = perf_event__swap_ops[event->header.type];
+ if (swap)
+ swap(event, sample_id_all);
+}
+
static int perf_session__process_event(struct perf_session *session,
union perf_event *event,
struct perf_tool *tool,
@@ -994,9 +1051,8 @@ static int perf_session__process_event(struct perf_session *session,
struct perf_sample sample;
int ret;
- if (session->header.needs_swap &&
- perf_event__swap_ops[event->header.type])
- perf_event__swap_ops[event->header.type](event);
+ if (session->header.needs_swap)
+ event_swap(event, session->sample_id_all);
if (event->header.type >= PERF_RECORD_HEADER_MAX)
return -EINVAL;
@@ -1428,7 +1484,6 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
int print_sym, int print_dso, int print_symoffset)
{
struct addr_location al;
- struct callchain_cursor *cursor = &evsel->hists.callchain_cursor;
struct callchain_cursor_node *node;
if (perf_event__preprocess_sample(event, machine, &al, sample,
@@ -1446,10 +1501,10 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
error("Failed to resolve callchain. Skipping\n");
return;
}
- callchain_cursor_commit(cursor);
+ callchain_cursor_commit(&callchain_cursor);
while (1) {
- node = callchain_cursor_current(cursor);
+ node = callchain_cursor_current(&callchain_cursor);
if (!node)
break;
@@ -1460,12 +1515,12 @@ void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
}
if (print_dso) {
printf(" (");
- map__fprintf_dsoname(al.map, stdout);
+ map__fprintf_dsoname(node->map, stdout);
printf(")");
}
printf("\n");
- callchain_cursor_advance(cursor);
+ callchain_cursor_advance(&callchain_cursor);
}
} else {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e2ba8858f3e1..3e2e5ea0f03f 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -323,6 +323,7 @@ struct dso *dso__new(const char *name)
dso->sorted_by_name = 0;
dso->has_build_id = 0;
dso->kernel = DSO_TYPE_USER;
+ dso->needs_swap = DSO_SWAP__UNSET;
INIT_LIST_HEAD(&dso->node);
}
@@ -1156,6 +1157,33 @@ static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr)
return -1;
}
+static int dso__swap_init(struct dso *dso, unsigned char eidata)
+{
+ static unsigned int const endian = 1;
+
+ dso->needs_swap = DSO_SWAP__NO;
+
+ switch (eidata) {
+ case ELFDATA2LSB:
+ /* We are big endian, DSO is little endian. */
+ if (*(unsigned char const *)&endian != 1)
+ dso->needs_swap = DSO_SWAP__YES;
+ break;
+
+ case ELFDATA2MSB:
+ /* We are little endian, DSO is big endian. */
+ if (*(unsigned char const *)&endian != 0)
+ dso->needs_swap = DSO_SWAP__YES;
+ break;
+
+ default:
+ pr_err("unrecognized DSO data encoding %d\n", eidata);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int dso__load_sym(struct dso *dso, struct map *map, const char *name,
int fd, symbol_filter_t filter, int kmodule,
int want_symtab)
@@ -1187,6 +1215,9 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name,
goto out_elf_end;
}
+ if (dso__swap_init(dso, ehdr.e_ident[EI_DATA]))
+ goto out_elf_end;
+
/* Always reject images with a mismatched build-id: */
if (dso->has_build_id) {
u8 build_id[BUILD_ID_SIZE];
@@ -1272,7 +1303,7 @@ static int dso__load_sym(struct dso *dso, struct map *map, const char *name,
if (opdsec && sym.st_shndx == opdidx) {
u32 offset = sym.st_value - opdshdr.sh_addr;
u64 *opd = opddata->d_buf + offset;
- sym.st_value = *opd;
+ sym.st_value = DSO__SWAP(dso, u64, *opd);
sym.st_shndx = elf_addr_to_index(elf, sym.st_value);
}
@@ -2786,8 +2817,11 @@ int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
struct map *dso__new_map(const char *name)
{
+ struct map *map = NULL;
struct dso *dso = dso__new(name);
- struct map *map = map__new2(0, dso, MAP__FUNCTION);
+
+ if (dso)
+ map = map__new2(0, dso, MAP__FUNCTION);
return map;
}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 5649d63798cb..af0752b1aca1 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
#include <stdio.h>
+#include <byteswap.h>
#ifdef HAVE_CPLUS_DEMANGLE
extern char *cplus_demangle(const char *, int);
@@ -160,11 +161,18 @@ enum dso_kernel_type {
DSO_TYPE_GUEST_KERNEL
};
+enum dso_swap_type {
+ DSO_SWAP__UNSET,
+ DSO_SWAP__NO,
+ DSO_SWAP__YES,
+};
+
struct dso {
struct list_head node;
struct rb_root symbols[MAP__NR_TYPES];
struct rb_root symbol_names[MAP__NR_TYPES];
enum dso_kernel_type kernel;
+ enum dso_swap_type needs_swap;
u8 adjust_symbols:1;
u8 has_build_id:1;
u8 hit:1;
@@ -182,6 +190,28 @@ struct dso {
char name[0];
};
+#define DSO__SWAP(dso, type, val) \
+({ \
+ type ____r = val; \
+ BUG_ON(dso->needs_swap == DSO_SWAP__UNSET); \
+ if (dso->needs_swap == DSO_SWAP__YES) { \
+ switch (sizeof(____r)) { \
+ case 2: \
+ ____r = bswap_16(val); \
+ break; \
+ case 4: \
+ ____r = bswap_32(val); \
+ break; \
+ case 8: \
+ ____r = bswap_64(val); \
+ break; \
+ default: \
+ BUG_ON(1); \
+ } \
+ } \
+ ____r; \
+})
+
struct dso *dso__new(const char *name);
void dso__delete(struct dso *dso);