From f47ec3f28354795f000c14bf18ed967ec81a3ec3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Nov 2011 21:15:42 -0500 Subject: trim fs/internal.h some stuff in there can actually become static; some belongs to pnode.h as it's a private interface between namespace.c and pnode.c... Signed-off-by: Al Viro --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 36254645b7cc..3f64b9f26e7d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1225,7 +1225,7 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -int check_unsafe_exec(struct linux_binprm *bprm) +static int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; -- cgit v1.2.3 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One of problem is that it's inherited at fork(). When a daemon set oom_score_adj and make children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points. - creating new task - renaming a task (exec) - set oom_score_adj To debug, users need to enable some trace pointer. Maybe filtering is useful as # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable output will be like this. # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 +++ include/trace/events/oom.h | 33 ++++++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 +++++ mm/oom_kill.c | 6 +++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7d..aeb135c7ff5c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f2..1aab5fe05a1b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 000000000000..dd4ba3b92002 --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 000000000000..b53add02e929 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c13..5e1391b5ade0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce3..7c122faa05c5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v1.2.3 From 96e02d1586782eadf051fa3d6bc4132d2447ac2c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 4 Feb 2012 10:47:10 +0100 Subject: exec: fix use-after-free bug in setup_new_exec() Setting the task name is done within setup_new_exec() by accessing bprm->filename. However this happens after flush_old_exec(). This may result in a use after free bug, flush_old_exec() may "complete" vfork_done, which will wake up the parent which in turn may free the passed in filename. To fix this add a new tcomm field in struct linux_binprm which contains the now early generated task name until it is used. Fixes this bug on s390: Unable to handle kernel pointer dereference at virtual kernel address 0000000039768000 Process kworker/u:3 (pid: 245, task: 000000003a3dc840, ksp: 0000000039453818) Krnl PSW : 0704000180000000 0000000000282e94 (setup_new_exec+0xa0/0x374) Call Trace: ([<0000000000282e2c>] setup_new_exec+0x38/0x374) [<00000000002dd12e>] load_elf_binary+0x402/0x1bf4 [<0000000000280a42>] search_binary_handler+0x38e/0x5bc [<0000000000282b6c>] do_execve_common+0x410/0x514 [<0000000000282cb6>] do_execve+0x46/0x58 [<00000000005bce58>] kernel_execve+0x28/0x70 [<000000000014ba2e>] ____call_usermodehelper+0x102/0x140 [<00000000005bc8da>] kernel_thread_starter+0x6/0xc [<00000000005bc8d4>] kernel_thread_starter+0x0/0xc Last Breaking-Event-Address: [<00000000002830f0>] setup_new_exec+0x2fc/0x374 Kernel panic - not syncing: Fatal exception: panic_on_oops Reported-by: Sebastian Ott Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- fs/exec.c | 33 +++++++++++++++++---------------- include/linux/binfmts.h | 3 ++- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'fs/exec.c') diff --git a/fs/exec.c b/fs/exec.c index aeb135c7ff5c..92ce83a11e90 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1071,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } +static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) +{ + int i, ch; + + /* Copies the binary name from after last slash */ + for (i = 0; (ch = *(fn++)) != '\0';) { + if (ch == '/') + i = 0; /* overwrite what we wrote */ + else + if (i < len - 1) + tcomm[i++] = ch; + } + tcomm[i] = '\0'; +} + int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1085,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm) set_mm_exe_file(bprm->mm, bprm->file); + filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1116,10 +1132,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - int i, ch; - const char *name; - char tcomm[sizeof(current->comm)]; - arch_pick_mmap_layout(current->mm); /* This is the point of no return */ @@ -1130,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - name = bprm->filename; - - /* Copies the binary name from after last slash */ - for (i=0; (ch = *(name++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < (sizeof(tcomm) - 1)) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; - set_task_comm(current, tcomm); + set_task_comm(current, bprm->tcomm); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fd88a3945aa1..0092102db2de 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -18,7 +18,7 @@ struct pt_regs; #define BINPRM_BUF_SIZE 128 #ifdef __KERNEL__ -#include +#include #define CORENAME_MAX_SIZE 128 @@ -58,6 +58,7 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + char tcomm[TASK_COMM_LEN]; }; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 -- cgit v1.2.3