From 739f79fc9db1b38f96b5a5109b247a650fbebf6d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 18 Aug 2017 15:15:48 -0700 Subject: mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback() Jaegeuk and Brad report a NULL pointer crash when writeback ending tries to update the memcg stats: BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0 IP: test_clear_page_writeback+0x12e/0x2c0 [...] RIP: 0010:test_clear_page_writeback+0x12e/0x2c0 Call Trace: end_page_writeback+0x47/0x70 f2fs_write_end_io+0x76/0x180 [f2fs] bio_endio+0x9f/0x120 blk_update_request+0xa8/0x2f0 scsi_end_request+0x39/0x1d0 scsi_io_completion+0x211/0x690 scsi_finish_command+0xd9/0x120 scsi_softirq_done+0x127/0x150 __blk_mq_complete_request_remote+0x13/0x20 flush_smp_call_function_queue+0x56/0x110 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x27/0x40 call_function_single_interrupt+0x89/0x90 RIP: 0010:native_safe_halt+0x6/0x10 (gdb) l *(test_clear_page_writeback+0x12e) 0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619). 614 mod_node_page_state(page_pgdat(page), idx, val); 615 if (mem_cgroup_disabled() || !page->mem_cgroup) 616 return; 617 mod_memcg_state(page->mem_cgroup, idx, val); 618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; 619 this_cpu_add(pn->lruvec_stat->count[idx], val); 620 } 621 622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 623 gfp_t gfp_mask, The issue is that writeback doesn't hold a page reference and the page might get freed after PG_writeback is cleared (and the mapping is unlocked) in test_clear_page_writeback(). The stat functions looking up the page's node or zone are safe, as those attributes are static across allocation and free cycles. But page->mem_cgroup is not, and it will get cleared if we race with truncation or migration. It appears this race window has been around for a while, but it was less likely to trigger when the memcg stats were updated first thing after PG_writeback was cleared. Recent changes reshuffled this code to update the global node stats before the memcg ones, though, stretching the race window out to an extent where people can reproduce the problem. Update test_clear_page_writeback() to look up and pin page->mem_cgroup before clearing PG_writeback, and then not use that pointer afterward. It is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()") but leaves alone the pageref-holding callsites that aren't affected.
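In outline, the calling convention this change introduces looks as follows; this is a hedged, condensed sketch of the test_clear_page_writeback() diff further below, with the bdi writeback accounting and the mapping handling omitted:

int test_clear_page_writeback(struct page *page)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	int ret;

	memcg = lock_page_memcg(page);		/* pins the memcg, not the page */
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	ret = TestClearPageWriteback(page);
	/* the page may be freed from this point on */
	if (ret)
		dec_lruvec_state(lruvec, NR_WRITEBACK);	/* uses the cached lruvec */
	__unlock_page_memcg(memcg);		/* unpins the memcg, never touches the page */
	return ret;
}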
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()") Signed-off-by: Johannes Weiner Reported-by: Jaegeuk Kim Tested-by: Jaegeuk Kim Reported-by: Bradley Bolen Tested-by: Brad Bolen Cc: Vladimir Davydov Cc: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++-- mm/memcontrol.c | 43 +++++++++++++++++++++++++++++++------------ mm/page-writeback.c | 15 ++++++++++++--- 3 files changed, 51 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3914e3dd6168..9b15a4bcfa77 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -void lock_page_memcg(struct page *page); +struct mem_cgroup *lock_page_memcg(struct page *page); +void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, @@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void lock_page_memcg(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) +{ + return NULL; +} + +static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..e09741af816f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1611,9 +1611,13 @@ cleanup: * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. + * + * It ensures lifetime of the returned memcg. Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. + */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1659,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). 
*/ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d31..bf050ab025b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. + */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } -- cgit v1.2.3 From 92e5aae457787d0bc6b255200d2fb116edf69794 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 18 Aug 2017 15:15:51 -0700 Subject: kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog Commit 05a4a9527931 ("kernel/watchdog: split up config options") lost the perf-based hardlockup detector's dependency on PERF_EVENTS, which can result in broken builds with some powerpc configurations. Restore the dependency. Add it for x86 too; although x86 always selects PERF_EVENTS, it seems reasonable to make the dependency explicit.
Link: http://lkml.kernel.org/r/20170810114452.6673-1-npiggin@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Nicholas Piggin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca7..81b0031f909f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -199,7 +199,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..29a1bf85e507 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,7 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API -- cgit v1.2.3 From 8ada92799ec4de00f4bc0f10b1ededa256c1ab22 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:55 -0700 Subject: wait: add wait_event_killable_timeout() These are the few pending fixes I have queued up for v4.13-final. One is a generic regression fix for recursive loops on kmod and the other one is a trivial printout correction. During the v4.13 development we assumed that recursive kmod loops were no longer possible. Clearly that is not true. The regression fix makes use of a new killable wait. We use a killable wait to be paranoid in how signals might be sent to modprobe and only accept a proper SIGKILL. The signal will only be available to userspace to issue *iff* a thread has already entered a wait state, and that happens only if we've already throttled after 50 kmod threads have been hit. Note that although it may seem excessive to trigger a failure after 5 seconds if all kmod threads remain busy, prior to the series of changes that went into v4.13 we would actually *always* fatally fail any request which came in if the limit was already reached. The new waiting implemented in v4.13 actually gives us *more* breathing room -- the wait for 5 seconds is a wait for *any* kmod thread to finish. We give up and fail *iff* no kmod thread has finished and they're *all* running straight for 5 consecutive seconds. If 50 kmod threads are running consecutively for 5 seconds something else must be really bad. Recursive loops with kmod are bad but they're also hard to implement properly as a selftest without fooling current userspace tools like kmod [1]. For instance, kmod will complain when you run depmod if it finds a recursive loop with a symbol dependency between modules; as such, this type of recursive loop cannot go upstream, since the modules_install target will fail after running depmod. These tests already exist on userspace kmod upstream though (refer to the testsuite/module-playground/mod-loop-*.c files). The same is not true if request_module() is used though, or worse if aliases are used.
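As an aside, here is a hedged sketch of how the new killable, timed wait is meant to be consumed, based on the return values documented in the wait.h hunk below; the waitqueue and counter names are purely illustrative, and the real call site is in the kmod patch later in this series:

	long ret;

	/* example_wq and example_count are illustrative names, not real kernel symbols */
	/* sleep until the counter can be decremented, a SIGKILL arrives, or 5 seconds pass */
	ret = wait_event_killable_timeout(example_wq,
					  atomic_dec_if_positive(&example_count) >= 0,
					  5 * HZ);
	if (!ret)			/* timed out, condition still false */
		return -ETIME;
	if (ret == -ERESTARTSYS)	/* interrupted, and only SIGKILL can do that */
		return ret;
	/* ret > 0: the condition became true within the timeout */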
Likewise the issue with 64-bit kernels booting 32-bit userspace without a binfmt handler built-in is also currently not detected and proactively avoided by userspace kmod tools, or kconfig for all architectures. Although we could complain in the kernel when some of these individual recursive issues creep up, proactively avoiding these situations in userspace at build time is what we should keep striving for. Lastly, since recursive loops could happen with kmod it may mean recursive loops may also be possible with other kernel usermode helpers, this should be investigated and long term if we can come up with a more sensible generic solution even better! [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/log/?h=20170809-kmod-for-v4.13-final [1] https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git This patch (of 3): This wait is similar to wait_event_interruptible_timeout() but only accepts SIGKILL interrupt signal. Other signals are ignored. Link: http://lkml.kernel.org/r/20170809234635.13443-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: "Eric W. Biederman" Cc: Shuah Khan Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Matt Redfearn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5b74e36c0ca8..dc19880c02f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_killable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_KILLABLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_KILLABLE) until the + * @condition evaluates to true or a kill signal is received. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a kill signal. + * + * Only kill signals interrupt this process. + */ +#define wait_event_killable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_killable_timeout(wq_head, \ + condition, timeout); \ + __ret; \ +}) + #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ -- cgit v1.2.3 From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format hanlders. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots a into o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R. Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem); static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + /* modprobe_path is set via /proc/sys. */ @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) 
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); -- cgit v1.2.3 From 768dc4e48420955518974d8486c1b00ec05e7274 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:16:02 -0700 Subject: test_kmod: fix description for -s -and -c parameters The descriptions were reversed, correct this. Link: http://lkml.kernel.org/r/20170809234635.13443-4-mcgrof@kernel.org Fixes: 64b671204afd71 ("test_sysctl: add generic script to expand on tests") Signed-off-by: Luis R. Rodriguez Reported-by: Daniel Mentz Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Matt Redfearn Cc: Matt Redfearn Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/kmod/kmod.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 8cecae9a8bca..7956ea3be667 100755 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -473,8 +473,8 @@ usage() echo " all Runs all tests (default)" echo " -t Run test ID the number amount of times is recommended" echo " -w Watch test ID run until it runs into an error" - echo " -c Run test ID once" - echo " -s Run test ID x test-count number of times" + echo " -s Run test ID once" + echo " -c Run test ID x test-count number of times" echo " -l List all test ID list" echo " -h|--help Help" echo -- cgit v1.2.3 From 3010f876500f9ba921afaeccec30c45ca6584dc8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 Aug 2017 15:16:05 -0700 Subject: mm: discard memblock data later There is existing use after free bug when deferred struct pages are enabled: The memblock_add() allocates memory for the memory array if more than 128 entries are needed. See comment in e820__memblock_setup(): * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries * than that - so allow memblock resizing. This memblock memory is freed here: free_low_memory_core_early() We access the freed memblock.memory later in boot when deferred pages are initialized in this path: deferred_init_memmap() for_each_mem_pfn_range() __next_mem_pfn_range() type = &memblock.memory; One possible explanation for why this use-after-free hasn't been hit before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded at least on systems where deferred struct pages were enabled. 
Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128, and verifying in qemu that this code is getting executed and that the freed pages are sane. Link: http://lkml.kernel.org/r/1502485554-318703-2-git-send-email-pasha.tatashin@oracle.com Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 6 ++++-- mm/memblock.c | 38 +++++++++++++++++--------------------- mm/nobootmem.c | 16 ---------------- mm/page_alloc.c | 4 ++++ 4 files changed, 25 insertions(+), 39 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77d427974f57..bae11c7e7bf3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,7 @@ extern int memblock_debug; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata +void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock @@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); -phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); @@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c..bf14aea6ab70 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions == memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } } - #endif /** diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee..3637809a18d0 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f746c2fd..1bad301820c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3 From f6ba488073fe8159851fe398cc3c5ee383bb4c7a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 18 Aug 2017 15:16:08 -0700 Subject: slub: fix per memcg cache leak on css offline To avoid a possible deadlock, sysfs_slab_remove() schedules an asynchronous work to delete sysfs entries corresponding to the kmem cache. To ensure the cache isn't freed before the work function is called, it takes a reference to the cache kobject. The reference is supposed to be released by the work function. However, the work function (sysfs_slab_remove_workfn()) does nothing in case the cache sysfs entry has already been deleted, leaking the kobject and the corresponding cache. This may happen on a per memcg cache destruction, because sysfs entries of a per memcg cache are deleted on memcg offline if the cache is empty (see __kmemcg_cache_deactivate()). 
The kmemleak report looks like this: unreferenced object 0xffff9f798a79f540 (size 32): comm "kworker/1:4", pid 15416, jiffies 4307432429 (age 28687.554s) hex dump (first 32 bytes): 6b 6d 61 6c 6c 6f 63 2d 31 36 28 31 35 39 39 3a kmalloc-16(1599: 6e 65 77 72 6f 6f 74 29 00 23 6b c0 ff ff ff ff newroot).#k..... backtrace: kmemleak_alloc+0x4a/0xa0 __kmalloc_track_caller+0x148/0x2c0 kvasprintf+0x66/0xd0 kasprintf+0x49/0x70 memcg_create_kmem_cache+0xe6/0x160 memcg_kmem_cache_create_func+0x20/0x110 process_one_work+0x205/0x5d0 worker_thread+0x4e/0x3a0 kthread+0x109/0x140 ret_from_fork+0x2a/0x40 unreferenced object 0xffff9f79b6136840 (size 416): comm "kworker/1:4", pid 15416, jiffies 4307432429 (age 28687.573s) hex dump (first 32 bytes): 40 fb 80 c2 3e 33 00 00 00 00 00 40 00 00 00 00 @...>3.....@.... 00 00 00 00 00 00 00 00 10 00 00 00 10 00 00 00 ................ backtrace: kmemleak_alloc+0x4a/0xa0 kmem_cache_alloc+0x128/0x280 create_cache+0x3b/0x1e0 memcg_create_kmem_cache+0x118/0x160 memcg_kmem_cache_create_func+0x20/0x110 process_one_work+0x205/0x5d0 worker_thread+0x4e/0x3a0 kthread+0x109/0x140 ret_from_fork+0x2a/0x40 Fix the leak by adding the missing call to kobject_put() to sysfs_slab_remove_workfn(). Link: http://lkml.kernel.org/r/20170812181134.25027-1-vdavydov.dev@gmail.com Fixes: 3b7b314053d02 ("slub: make sysfs file removal asynchronous") Signed-off-by: Vladimir Davydov Reported-by: Andrei Vagin Tested-by: Andrei Vagin Acked-by: Tejun Heo Acked-by: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: [4.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 1d3f9835f4ea..e8b4e31162ca 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) * A cache is never shut down before deactivation is * complete, so no need to worry about synchronization. */ - return; + goto out; #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); +out: kobject_put(&s->kobj); } -- cgit v1.2.3 From 5b53a6ea886700a128b697a6fe8375340dea2c30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:12 -0700 Subject: mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS Tetsuo Handa has noticed that MMF_UNSTABLE SIGBUS path in handle_mm_fault causes a lockdep splat Out of memory: Kill process 1056 (a.out) score 603 or sacrifice child Killed process 1056 (a.out) total-vm:4268108kB, anon-rss:2246048kB, file-rss:0kB, shmem-rss:0kB a.out (1169) used greatest stack depth: 11664 bytes left DEBUG_LOCKS_WARN_ON(depth <= 0) ------------[ cut here ]------------ WARNING: CPU: 6 PID: 1339 at kernel/locking/lockdep.c:3617 lock_release+0x172/0x1e0 CPU: 6 PID: 1339 Comm: a.out Not tainted 4.13.0-rc3-next-20170803+ #142 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 RIP: 0010:lock_release+0x172/0x1e0 Call Trace: up_read+0x1a/0x40 __do_page_fault+0x28e/0x4c0 do_page_fault+0x30/0x80 page_fault+0x28/0x30 The reason is that the page fault path might have dropped the mmap_sem and returned with VM_FAULT_RETRY. MMF_UNSTABLE check however rewrites the error path to VM_FAULT_SIGBUS and we always expect mmap_sem taken in that path. Fix this by taking mmap_sem when VM_FAULT_RETRY is held in the MMF_UNSTABLE path. 
We cannot simply add VM_FAULT_SIGBUS to the existing error code because all arch specific page fault handlers and g-u-p would have to learn a new error code combination. Link: http://lkml.kernel.org/r/20170807113839.16695-2-mhocko@kernel.org Fixes: 3f70dc38cec2 ("mm: make sure that kthreads will not refault oom reaped memory") Reported-by: Tetsuo Handa Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Andrea Argangeli Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Wenwei Tao Cc: [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e158f7ac6730..c717b5bcc80e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3910,8 +3910,18 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * further. */ if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) + && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { + + /* + * We are going to enforce SIGBUS but the PF path might have + * dropped the mmap_sem already so take it again so that + * we do not break expectations of all arch specific PF paths + * and g-u-p + */ + if (ret & VM_FAULT_RETRY) + down_read(&vma->vm_mm->mmap_sem); ret = VM_FAULT_SIGBUS; + } return ret; } -- cgit v1.2.3 From 6b31d5955cb29a51c5baffee382f213d75e98fb8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:15 -0700 Subject: mm, oom: fix potential data corruption when oom_reaper races with writer Wenwei Tao has noticed that our current assumption that the oom victim is dying and never doing any visible changes after it dies, and so the oom_reaper can tear it down, is not entirely true. __task_will_free_mem consider a task dying when SIGNAL_GROUP_EXIT is set but do_group_exit sends SIGKILL to all threads _after_ the flag is set. So there is a race window when some threads won't have fatal_signal_pending while the oom_reaper could start unmapping the address space. Moreover some paths might not check for fatal signals before each PF/g-u-p/copy_from_user. We already have a protection for oom_reaper vs. PF races by checking MMF_UNSTABLE. This has been, however, checked only for kernel threads (use_mm users) which can outlive the oom victim. A simple fix would be to extend the current check in handle_mm_fault for all tasks but that wouldn't be sufficient because the current check assumes that a kernel thread would bail out after EFAULT from get_user*/copy_from_user and never re-read the same address which would succeed because the PF path has established page tables already. This seems to be the case for the only existing use_mm user currently (virtio driver) but it is rather fragile in general. This is even more fragile in general for more complex paths such as generic_perform_write which can re-read the same address more times (e.g. iov_iter_copy_from_user_atomic to fail and then iov_iter_fault_in_readable on retry). Therefore we have to implement MMF_UNSTABLE protection in a robust way and never make a potentially corrupted content visible. That requires to hook deeper into the PF path and check for the flag _every time_ before a pte for anonymous memory is established (that means all !VM_SHARED mappings). The corruption can be triggered artificially (http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp) but there doesn't seem to be any real life bug report. 
The race window should be quite tight to trigger most of the time. Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Wenwei Tao Tested-by: Tetsuo Handa Cc: "Kirill A. Shutemov" Cc: Andrea Argangeli Cc: David Rientjes Cc: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 22 ++++++++++++++++++++++ mm/huge_memory.c | 30 ++++++++++++++++++++++-------- mm/memory.c | 46 ++++++++++++++++++++-------------------------- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 8a266e2be5a6..76aac4ce39bc 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -6,6 +6,8 @@ #include #include #include +#include /* MMF_* */ +#include /* VM_FAULT* */ struct zonelist; struct notifier_block; @@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Checks whether a page fault on the given mm is still reliable. + * This is no longer true if the oom reaper started to reap the + * address space which is reflected by MMF_UNSTABLE flag set in + * the mm. At that moment any !shared mapping would lose the content + * and could cause a memory corruption (zero pages instead of the + * original content). + * + * User should call this before establishing a page table entry for + * a !shared mapping and under the proper page table lock. + * + * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. + */ +static inline int check_stable_address_space(struct mm_struct *mm) +{ + if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + return VM_FAULT_SIGBUS; + return 0; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f6ef0b..90731e3b7e58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } clear_huge_page(page, haddr, HPAGE_PMD_NR); @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* @@ -688,7 +699,10 @@ int 
do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); diff --git a/mm/memory.c b/mm/memory.c index c717b5bcc80e..fe2fba27ded2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2978,7 +2987,7 @@ setpte: update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. 
- */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { - - /* - * We are going to enforce SIGBUS but the PF path might have - * dropped the mmap_sem already so take it again so that - * we do not break expectations of all arch specific PF paths - * and g-u-p - */ - if (ret & VM_FAULT_RETRY) - down_read(&vma->vm_mm->mmap_sem); - ret = VM_FAULT_SIGBUS; - } - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); -- cgit v1.2.3 From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable, which is unlikely to be what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From da094e42848e3c36feaa3b5271e53983fd45424f Mon Sep 17 00:00:00 2001 From: Prakash Gupta Date: Fri, 18 Aug 2017 15:16:21 -0700 Subject: mm/cma_debug.c: fix stack corruption due to sprintf usage name[] in cma_debugfs_add_one() can only accommodate 16 chars, including the terminating NUL, for the sprintf output. It's common for a cma device name to be longer than 15 chars. This can cause stack corruption. If the gcc stack protector is turned on, this can cause a panic due to stack corruption. Below is one example trace: Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffff8e69a75730 Call trace: dump_backtrace+0x0/0x2c4 show_stack+0x20/0x28 dump_stack+0xb8/0xf4 panic+0x154/0x2b0 print_tainted+0x0/0xc0 cma_debugfs_init+0x274/0x290 do_one_initcall+0x5c/0x168 kernel_init_freeable+0x1c8/0x280 Fix the short sprintf buffer in cma_debugfs_add_one() by using scnprintf() instead of sprintf().
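For comparison, a minimal illustration of the bounded variant (the buffer and format come from the code above; the return-value handling is only an illustrative addition): scnprintf() never writes more than the given size, always NUL-terminates, and returns the number of characters actually stored, so an overlong cma->name is truncated instead of overrunning the on-stack buffer.

	char name[16];
	int len;

	/* never writes past name[15]; the result is always NUL-terminated */
	len = scnprintf(name, sizeof(name), "cma-%s", cma->name);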
Link: http://lkml.kernel.org/r/1502446217-21840-1-git-send-email-guptap@codeaurora.org Fixes: f318dd083c81 ("cma: Store a name in the cma structure") Signed-off-by: Prakash Gupta Acked-by: Laura Abbott Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 595b757bef72..c03ccbc405a0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%s", cma->name); + scnprintf(name, sizeof(name), "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); -- cgit v1.2.3 From 73223e4e2e3867ebf033a5a8eb2e5df0158ccc99 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 18 Aug 2017 15:16:24 -0700 Subject: mm/mempolicy: fix use after free when calling get_mempolicy I hit a use after free issue when executing trinity and reproduced it with KASAN enabled. The related call trace is as follows. BUG: KASan: use after free in SyS_get_mempolicy+0x3c8/0x960 at addr ffff8801f582d766 Read of size 2 by task syz-executor1/798 INFO: Allocated in mpol_new.part.2+0x74/0x160 age=3 cpu=1 pid=799 __slab_alloc+0x768/0x970 kmem_cache_alloc+0x2e7/0x450 mpol_new.part.2+0x74/0x160 mpol_new+0x66/0x80 SyS_mbind+0x267/0x9f0 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x2b/0x40 age=4 cpu=1 pid=799 __slab_free+0x495/0x8e0 kmem_cache_free+0x2f3/0x4c0 __mpol_put+0x2b/0x40 SyS_mbind+0x383/0x9f0 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0009cb8dc0 objects=23 used=8 fp=0xffff8801f582de40 flags=0x200000000004080 INFO: Object 0xffff8801f582d760 @offset=5984 fp=0xffff8801f582d600 Bytes b4 ffff8801f582d750: ae 01 ff ff 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ Object ffff8801f582d760: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk Object ffff8801f582d770: 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkk. Redzone ffff8801f582d778: bb bb bb bb bb bb bb bb ........ Padding ffff8801f582d8b8: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ Memory state around the buggy address: ffff8801f582d600: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801f582d680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8801f582d700: fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb fc A !shared memory policy is not protected against parallel removal by another thread, something that is normally prevented by holding the mmap_sem. do_get_mempolicy(), however, drops the lock midway even though the policy can still be accessed later. The early, premature up_read() is a historical artifact from the times when put_user() was called in this path (see https://lwn.net/Articles/124754/), but that has been gone since 8bccd85ffbaf ("[PATCH] Implement sys_* do_* layering in the memory policy layer."); with the current mempolicy ref count model, the premature unlock is what introduced this use-after-free. Fix the issue by removing the premature release.
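In schematic form, the rule the fix restores is that mmap_sem must stay held for as long as a VMA-derived policy pointer is used. A hedged sketch, simplified rather than the literal do_get_mempolicy() code; mm and addr stand in for the task's mm_struct and the queried address:

	struct vm_area_struct *vma;
	struct mempolicy *pol;

	down_read(&mm->mmap_sem);
	vma = find_vma_intersection(mm, addr, addr + 1);
	pol = vma ? vma->vm_policy : NULL;	/* the real code falls back to the task policy */
	/*
	 * Do not drop mmap_sem yet: without a reference of its own, pol can
	 * be freed by a concurrent mbind() as soon as the semaphore is
	 * released.
	 */
	/* ... copy pol->mode and pol->flags into the user-visible results ... */
	up_read(&mm->mmap_sem);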
Link: http://lkml.kernel.org/r/1502950924-27521-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: Michal Hocko Cc: Minchan Kim Cc: Vlastimil Babka Cc: David Rientjes Cc: Mel Gorman Cc: [2.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d911fa5cb2a7..618ab125228b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy |= (pol->flags & MPOL_MODE_FLAGS); } - if (vma) { - up_read(¤t->mm->mmap_sem); - vma = NULL; - } - err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { -- cgit v1.2.3 From 704b862f9efd6d4c87a8d0a344dda19bda9c6b69 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 18 Aug 2017 15:16:27 -0700 Subject: mm/vmalloc.c: don't unconditonally use __GFP_HIGHMEM Commit 19809c2da28a ("mm, vmalloc: use __GFP_HIGHMEM implicitly") added use of __GFP_HIGHMEM for allocations. vmalloc_32 may use GFP_DMA/GFP_DMA32 which does not play nice with __GFP_HIGHMEM and will trigger a BUG in gfp_zone. Only add __GFP_HIGHMEM if we aren't using GFP_DMA/GFP_DMA32. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1482249 Link: http://lkml.kernel.org/r/20170816220705.31374-1-labbott@redhat.com Fixes: 19809c2da28a ("mm, vmalloc: use __GFP_HIGHMEM implicitly") Signed-off-by: Laura Abbott Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8698c1c86c4d..a47e3894c775 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? + 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } -- cgit v1.2.3 From c715b72c1ba406f133217b509044c38d8e714a37 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 2017 15:16:31 -0700 Subject: mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes Moving the x86_64 and arm64 PIE base from 0x555555554000 to 0x000100000000 broke AddressSanitizer. This is a partial revert of: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") The AddressSanitizer tool has hard-coded expectations about where executable mappings are loaded. The motivation for changing the PIE base in the above commits was to avoid the Stack-Clash CVEs that allowed executable mappings to get too close to heap and stack. This was mainly a problem on 32-bit, but the 64-bit bases were moved too, in an effort to proactively protect those systems (proofs of concept do exist that show 64-bit collisions, but other recent changes to fix stack accounting and setuid behaviors will minimize the impact). The new 32-bit PIE base is fine for ASan (since it matches the ET_EXEC base), so only the 64-bit PIE base needs to be reverted to let x86 and arm64 ASan binaries run again. Future changes to the 64-bit PIE base on these architectures can be made optional once a more dynamic method for dealing with AddressSanitizer is found. (e.g. always loading PIE into the mmap region for marked binaries.) Link: http://lkml.kernel.org/r/20170807201542.GA21271@beast Fixes: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") Fixes: 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") Signed-off-by: Kees Cook Reported-by: Kostya Serebryany Acked-by: Will Deacon Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/elf.h | 4 ++-- arch/x86/include/asm/elf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index acae781f7359..3288c2b36731 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,10 +114,10 @@ /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. 
*/ -#define ELF_ET_DYN_BASE 0x100000000UL +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1c18d83d3f09..9aeb91935ce0 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -247,11 +247,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, -- cgit v1.2.3