diff options
-rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 15 | ||||
-rw-r--r-- | block/blk-throttle.c | 2 | ||||
-rw-r--r-- | block/cfq-iosched.c | 4 | ||||
-rw-r--r-- | include/linux/backing-dev.h | 5 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 60 | ||||
-rw-r--r-- | include/linux/cgroup.h | 114 | ||||
-rw-r--r-- | include/linux/hugetlb_cgroup.h | 4 | ||||
-rw-r--r-- | include/linux/init_task.h | 8 | ||||
-rw-r--r-- | include/linux/jump_label.h | 18 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 8 | ||||
-rw-r--r-- | include/linux/sched.h | 12 | ||||
-rw-r--r-- | kernel/cgroup.c | 765 | ||||
-rw-r--r-- | kernel/cpuset.c | 70 | ||||
-rw-r--r-- | kernel/fork.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 25 |
15 files changed, 602 insertions, 512 deletions
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index e0975c2cf03d..176b940f8327 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -341,11 +341,11 @@ is riddled with issues. unnecessarily complicated and probably done this way because event delivery itself was expensive. -Unified hierarchy implements an interface file "cgroup.populated" -which can be used to monitor whether the cgroup's subhierarchy has -tasks in it or not. Its value is 0 if there is no task in the cgroup -and its descendants; otherwise, 1. poll and [id]notify events are -triggered when the value changes. +Unified hierarchy implements "populated" field in "cgroup.events" +interface file which can be used to monitor whether the cgroup's +subhierarchy has tasks in it or not. Its value is 0 if there is no +task in the cgroup and its descendants; otherwise, 1. poll and +[id]notify events are triggered when the value changes. This is significantly lighter and simpler and trivially allows delegating management of subhierarchy - subhierarchy monitoring can @@ -435,6 +435,11 @@ may be specified in any order and not all pairs have to be specified. the first entry in the file. Specific entries can use "default" as its value to indicate inheritance of the default value. +- For events which are not very high frequency, an interface file + "events" should be created which lists event key value pairs. + Whenever a notifiable event happens, file modified event should be + generated on the file. + 5-4. Per-Controller Changes diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c75a2636dd40..2149a1ddbacf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -369,7 +369,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; - if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 04de88463a98..1f9093e901da 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1581,7 +1581,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; if (cpd_to_blkcg(cpd) == &blkcg_root) @@ -1599,7 +1599,7 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd) static void cfq_cpd_bind(struct blkcg_policy_data *cpd) { struct blkcg *blkcg = cpd_to_blkcg(cpd); - bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; if (blkcg == &blkcg_root) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d5eb4ad1c534..08d9a8eac42c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/blkdev.h> #include <linux/writeback.h> -#include <linux/memcontrol.h> #include <linux/blk-cgroup.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> @@ -263,8 +262,8 @@ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); - return cgroup_on_dfl(mem_cgroup_root_css->cgroup) && - cgroup_on_dfl(blkcg_root_css->cgroup) && + return cgroup_subsys_on_dfl(memory_cgrp_subsys) && + cgroup_subsys_on_dfl(io_cgrp_subsys) && bdi_cap_account_dirty(bdi) && (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8492721b39be..df589a097539 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -76,6 +76,7 @@ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ @@ -83,6 +84,17 @@ enum { }; /* + * cgroup_file is the handle for a file instance created in a cgroup which + * is used, for example, to generate file changed notifications. This can + * be obtained by setting cftype->file_offset. + */ +struct cgroup_file { + /* do not access any fields from outside cgroup core */ + struct list_head node; /* anchored at css->files */ + struct kernfs_node *kn; +}; + +/* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. * @@ -122,6 +134,9 @@ struct cgroup_subsys_state { */ u64 serial_nr; + /* all cgroup_files associated with this css */ + struct list_head files; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -225,8 +240,8 @@ struct cgroup { int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ - struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ + struct cgroup_file procs_file; /* handle for "cgroup.procs" */ + struct cgroup_file events_file; /* handle for "cgroup.events" */ /* * The bitmask of subsystems enabled on the child cgroups. @@ -324,11 +339,6 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; unsigned long private; - /* - * If not 0, file mode is set to this value, otherwise it will - * be figured out automatically - */ - umode_t mode; /* * The maximum length of string, excluding trailing nul, that can @@ -340,6 +350,14 @@ struct cftype { unsigned int flags; /* + * If non-zero, should contain the offset from the start of css to + * a struct cgroup_file field. cgroup will record the handle of + * the created file into it. The recorded handle can be used as + * long as the containing css remains accessible. + */ + unsigned int file_offset; + + /* * Fields used for internal bookkeeping. Initialized automatically * during registration. */ @@ -419,7 +437,6 @@ struct cgroup_subsys { struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int disabled; int early_init; /* @@ -473,8 +490,31 @@ struct cgroup_subsys { unsigned int depends_on; }; -void cgroup_threadgroup_change_begin(struct task_struct *tsk); -void cgroup_threadgroup_change_end(struct task_struct *tsk); +extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +/** + * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_begin() and allows cgroup operations to + * synchronize against threadgroup changes using a percpu_rw_semaphore. + */ +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_end(). Counterpart of + * cgroup_threadcgroup_change_begin(). + */ +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&cgroup_threadgroup_rwsem); +} #else /* CONFIG_CGROUPS */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index eb7ca55f72ef..e9c3eac074e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> +#include <linux/jump_label.h> #include <linux/cgroup-defs.h> @@ -50,6 +51,26 @@ extern struct css_set init_css_set; #include <linux/cgroup_subsys.h> #undef SUBSYS +#define SUBSYS(_x) \ + extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ + extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +/** + * cgroup_subsys_enabled - fast test on whether a subsys is enabled + * @ss: subsystem in question + */ +#define cgroup_subsys_enabled(ss) \ + static_branch_likely(&ss ## _enabled_key) + +/** + * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy + * @ss: subsystem in question + */ +#define cgroup_subsys_on_dfl(ss) \ + static_branch_likely(&ss ## _on_dfl_key) + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, @@ -211,11 +232,33 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @tset: taskset to iterate + * + * @tset may contain multiple tasks and they may belong to multiple + * processes. When there are multiple tasks in @tset, if a task of a + * process is in @tset, all tasks of the process are in @tset. Also, all + * are guaranteed to share the same source and destination csses. + * + * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) +/** + * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset + * @leader: the loop cursor + * @tset: takset to iterate + * + * Iterate threadgroup leaders of @tset. For single-task migrations, @tset + * may not contain any. + */ +#define cgroup_taskset_for_each_leader(leader, tset) \ + for ((leader) = cgroup_taskset_first((tset)); (leader); \ + (leader) = cgroup_taskset_next((tset))) \ + if ((leader) != (leader)->group_leader) \ + ; \ + else + /* * Inline functions. */ @@ -412,64 +455,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } -/** - * cgroup_on_dfl - test whether a cgroup is on the default hierarchy - * @cgrp: the cgroup of interest - * - * The default hierarchy is the v2 interface of cgroup and this function - * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the - * interface version. - * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * - * List of changed behaviors: - * - * - Mount options "noprefix", "xattr", "clone_children", "release_agent" - * and "name" are disallowed. - * - * - When mounting an existing superblock, mount options should match. - * - * - Remount is disallowed. - * - * - rename(2) is disallowed. - * - * - "tasks" is removed. Everything should be at process granularity. Use - * "cgroup.procs" instead. - * - * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. - * - * - "release_agent" and "notify_on_release" are removed. Replacement - * notification mechanism will be implemented. - * - * - "cgroup.clone_children" is removed. - * - * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup - * and its descendants contain no task; otherwise, 1. The file also - * generates kernfs notification which can be monitored through poll and - * [di]notify when the value of the file changes. - * - * - cpuset: tasks will be kept in empty cpusets when hotplug happens and - * take masks of ancestors with non-empty cpus/mems, instead of being - * moved to an ancestor. - * - * - cpuset: a task can be moved into an empty cpuset, and again it takes - * masks of ancestors. - * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * - * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. - */ -static inline bool cgroup_on_dfl(const struct cgroup *cgrp) -{ - return cgrp->root == &cgrp_dfl_root; -} - /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { @@ -527,6 +512,19 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +/** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +static inline void cgroup_file_notify(struct cgroup_file *cfile) +{ + /* might not have been created due to one of the CFTYPE selector flags */ + if (cfile->kn) + kernfs_notify(cfile->kn); +} + #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index bcc853eccc85..7edd30515298 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -48,9 +48,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_disabled(void) { - if (hugetlb_cgrp_subsys.disabled) - return true; - return false; + return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e38681f4912d..d0b380ee7d67 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,13 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#ifdef CONFIG_CGROUPS -#define INIT_GROUP_RWSEM(sig) \ - .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), -#else -#define INIT_GROUP_RWSEM(sig) -#endif - #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -64,7 +57,6 @@ extern struct fs_struct init_fs; INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ - INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7f653e8f6690..c9ca050de846 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,11 +216,6 @@ static inline int jump_label_apply_nops(struct module *mod) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled -static inline bool static_key_enabled(struct static_key *key) -{ - return static_key_count(key) > 0; -} - static inline void static_key_enable(struct static_key *key) { int count = static_key_count(key); @@ -267,6 +262,17 @@ struct static_key_false { #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT +extern bool ____wrong_branch_error(void); + +#define static_key_enabled(x) \ +({ \ + if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ + !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ + !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ + ____wrong_branch_error(); \ + static_key_count((struct static_key *)x) > 0; \ +}) + #ifdef HAVE_JUMP_LABEL /* @@ -325,8 +331,6 @@ struct static_key_false { * See jump_label_type() / jump_label_init_type(). */ -extern bool ____wrong_branch_error(void); - #define static_branch_likely(x) \ ({ \ bool branch; \ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad800e62cb7a..c83c699a6605 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -213,6 +213,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + /* handle for "memory.events" */ + struct cgroup_file events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -286,6 +289,7 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, unsigned int nr) { this_cpu_add(memcg->stat->events[idx], nr); + cgroup_file_notify(&memcg->events_file); } bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -347,9 +351,7 @@ ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_disabled(void) { - if (memory_cgrp_subsys.disabled) - return true; - return false; + return !cgroup_subsys_enabled(memory_cgrp_subsys); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index b7b9501b41af..a4ab9daa387c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -762,18 +762,6 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. - */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2c9eae6ad970..f924158a1b65 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -46,6 +46,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/rwsem.h> +#include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> @@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -136,6 +139,27 @@ static const char *cgroup_subsys_name[] = { }; #undef SUBSYS +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are @@ -197,9 +221,81 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differnetly depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -516,8 +612,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - if (cgrp->populated_kn) - kernfs_notify(cgrp->populated_kn); + cgroup_file_notify(&cgrp->events_file); + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -871,48 +967,6 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -void cgroup_threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} - -void cgroup_threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. - * - * fork and exit explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. - * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. - */ -static void threadgroup_lock(struct task_struct *tsk) -{ - down_write(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock - * - * Reverse threadgroup_lock(). - */ -static inline void threadgroup_unlock(struct task_struct *tsk) -{ - up_write(&tsk->signal->group_rwsem); -} - static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1063,7 +1117,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1086,23 +1139,21 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander + * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->mode) - return cft->mode; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write) - mode |= S_IWUSR; + if (cft->write_u64 || cft->write_s64 || cft->write) { + if (cft->flags & CFTYPE_WORLD_WRITABLE) + mode |= S_IWUGO; + else + mode |= S_IWUSR; + } return mode; } @@ -1263,28 +1314,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - remove subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be removed + * css_clear_dir - remove subsys files in a cgroup directory + * @css: taget css + * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void css_clear_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) { - struct cgroup_subsys *ss; - int i; + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts; - for_each_subsys(ss, i) { - struct cftype *cfts; + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); +} - if (!(subsys_mask & (1 << i))) - continue; - list_for_each_entry(cfts, &ss->cfts, node) - cgroup_addrm_files(cgrp, cfts, false); +/** + * css_populate_dir - create subsys files in a cgroup directory + * @css: target css + * @cgrp_overried: specify if target cgroup is different from css->cgroup + * + * On failure, no file is added. + */ +static int css_populate_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) +{ + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts, *failed_cfts; + int ret; + + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_dfl_base_files; + else + cfts = cgroup_legacy_base_files; + + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } + return 0; +err: + list_for_each_entry(cfts, &css->ss->cfts, node) { + if (cfts == failed_cfts) + break; + cgroup_addrm_files(css, cgrp, cfts, false); + } + return ret; } static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { + struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; unsigned long tmp_ss_mask; int ssid, i, ret; @@ -1306,10 +1393,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); - if (ret) { - if (dst_root != &cgrp_dfl_root) - return ret; + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + struct cgroup *scgrp = &ss->root->cgrp; + int tssid; + + ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); + if (!ret) + continue; /* * Rebinding back to the default root is not allowed to @@ -1317,57 +1407,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * be rare. Moving subsystems back and forth even more so. * Just warn about it and continue. */ - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + if (dst_root == &cgrp_dfl_root) { + if (cgrp_dfl_root_visible) { + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + } + continue; } + + for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + if (tssid == ssid) + break; + css_clear_dir(cgroup_css(scgrp, ss), dcgrp); + } + return ret; } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys_which(ss, ssid, &ss_mask) - cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - for_each_subsys_which(ss, ssid, &ss_mask) { - struct cgroup_root *src_root; - struct cgroup_subsys_state *css; + struct cgroup_root *src_root = ss->root; + struct cgroup *scgrp = &src_root->cgrp; + struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; - src_root = ss->root; - css = cgroup_css(&src_root->cgrp, ss); + WARN_ON(!css || cgroup_css(dcgrp, ss)); - WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); + css_clear_dir(css, NULL); - RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); - rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); + rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = &dst_root->cgrp; + css->cgroup = dcgrp; down_write(&css_set_rwsem); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], - &dst_root->cgrp.e_csets[ss->id]); + &dcgrp->e_csets[ss->id]); up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(&src_root->cgrp); + scgrp->subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) { - dst_root->cgrp.subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + if (dst_root == &cgrp_dfl_root) { + static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); + } else { + dcgrp->subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(dcgrp); + static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } if (ss->bind) ss->bind(css); } - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dcgrp->kn); return 0; } @@ -1497,7 +1597,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) for_each_subsys(ss, i) { if (strcmp(token, ss->legacy_name)) continue; - if (ss->disabled) + if (!cgroup_ssid_enabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -1528,7 +1628,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (!ss->disabled) + if (cgroup_ssid_enabled(i)) opts->subsys_mask |= (1 << i); /* @@ -1671,6 +1771,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); + INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1708,7 +1809,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1747,12 +1847,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - if (root == &cgrp_dfl_root) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(root_cgrp, base_files, true); + ret = css_populate_dir(&root_cgrp->self, NULL); if (ret) goto destroy_root; @@ -2049,6 +2144,49 @@ struct cgroup_taskset { struct task_struct *cur_task; }; +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +/** + * cgroup_taskset_add - try to add a migration target task to a taskset + * @task: target task + * @tset: target taskset + * + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. + */ +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) +{ + struct css_set *cset; + + lockdep_assert_held(&css_set_rwsem); + + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + return; + + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + return; + + cset = task_css_set(task); + if (!cset->mg_src_cgrp) + return; + + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset->src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); +} + /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest @@ -2113,22 +2251,15 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, lockdep_assert_held(&css_set_rwsem); /* - * We are synchronized through threadgroup_lock() against PF_EXITING - * setting such that we can't race against cgroup_exit() changing the - * css_set to init_css_set and dropping the old one. + * We are synchronized through cgroup_threadgroup_rwsem against + * PF_EXITING setting such that we can't race against cgroup_exit() + * changing the css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - - /* - * Use move_tail so that cgroup_taskset_first() still returns the - * leader after migration. This works because cgroup_migrate() - * ensures that the dst_cset of the leader is the first on the - * tset's dst_csets list. - */ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -2140,6 +2271,84 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, } /** + * cgroup_taskset_migrate - migrate a taskset to a cgroup + * @tset: taget taskset + * @dst_cgrp: destination cgroup + * + * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the + * ->can_attach callbacks fails and guarantees that either all or none of + * the tasks in @tset are migrated. @tset is consumed regardless of + * success. + */ +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup *dst_cgrp) +{ + struct cgroup_subsys_state *css, *failed_css = NULL; + struct task_struct *task, *tmp_task; + struct css_set *cset, *tmp_cset; + int i, ret; + + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; + + /* check that we can legitimately attach to the cgroup */ + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->can_attach) { + ret = css->ss->can_attach(css, tset); + if (ret) { + failed_css = css; + goto out_cancel_attach; + } + } + } + + /* + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. + */ + down_write(&css_set_rwsem); + list_for_each_entry(cset, &tset->src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); + } + up_write(&css_set_rwsem); + + /* + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. + */ + tset->csets = &tset->dst_csets; + + for_each_e_css(css, i, dst_cgrp) + if (css->ss->attach) + css->ss->attach(css, tset); + + ret = 0; + goto out_release_tset; + +out_cancel_attach: + for_each_e_css(css, i, dst_cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, tset); + } +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset->dst_csets, &tset->src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + up_write(&css_set_rwsem); + return ret; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2172,10 +2381,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * @src_cset and add it to @preloaded_csets, which should later be cleaned * up by cgroup_migrate_finish(). * - * This function may be called without holding threadgroup_lock even if the - * target is a process. Threads may be created and destroyed but as long - * as cgroup_mutex is not dropped, no new css_set can be put into play and - * the preloaded css_sets are guaranteed to cover all migrations. + * This function may be called without holding cgroup_threadgroup_rwsem + * even if the target is a process. Threads may be created and destroyed + * but as long as cgroup_mutex is not dropped, no new css_set can be put + * into play and the preloaded css_sets are guaranteed to cover all + * migrations. */ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, @@ -2273,12 +2483,12 @@ err: /** * cgroup_migrate - migrate a process or task to a cgroup - * @cgrp: the destination cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task + * @cgrp: the destination cgroup * * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding threadgroup_lock of @leader. The + * process, the caller must be holding cgroup_threadgroup_rwsem. The * caller is also responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). @@ -2289,18 +2499,11 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) -{ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct cgroup_subsys_state *css, *failed_css = NULL; - struct css_set *cset, *tmp_cset; - struct task_struct *task, *tmp_task; - int i, ret; +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup *cgrp) +{ + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct task_struct *task; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2311,93 +2514,14 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - /* @task either already exited or can't exit until the end */ - if (task->flags & PF_EXITING) - goto next; - - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - goto next; - - cset = task_css_set(task); - if (!cset->mg_src_cgrp) - goto next; - - /* - * cgroup_taskset_first() must always return the leader. - * Take care to avoid disturbing the ordering. - */ - list_move_tail(&task->cg_list, &cset->mg_tasks); - if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset.src_csets); - if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset.dst_csets); - next: + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); up_write(&css_set_rwsem); - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset.src_csets)) - return 0; - - /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, cgrp) { - if (css->ss->can_attach) { - ret = css->ss->can_attach(css, &tset); - if (ret) { - failed_css = css; - goto out_cancel_attach; - } - } - } - - /* - * Now that we're guaranteed success, proceed to move all tasks to - * the new cgroup. There are no failure cases after here, so this - * is the commit point. - */ - down_write(&css_set_rwsem); - list_for_each_entry(cset, &tset.src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(cset->mg_src_cgrp, task, - cset->mg_dst_cset); - } - up_write(&css_set_rwsem); - - /* - * Migration is committed, all target tasks are now on dst_csets. - * Nothing is sensitive to fork() after this point. Notify - * controllers that migration is complete. - */ - tset.csets = &tset.dst_csets; - - for_each_e_css(css, i, cgrp) - if (css->ss->attach) - css->ss->attach(css, &tset); - - ret = 0; - goto out_release_tset; - -out_cancel_attach: - for_each_e_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } -out_release_tset: - down_write(&css_set_rwsem); - list_splice_init(&tset.dst_csets, &tset.src_csets); - list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_tail_init(&cset->mg_tasks, &cset->tasks); - list_del_init(&cset->mg_node); - } - up_write(&css_set_rwsem); - return ret; + return cgroup_taskset_migrate(&tset, cgrp); } /** @@ -2406,7 +2530,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and threadgroup_lock of @leader. + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2431,7 +2555,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2467,7 +2591,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, cgrp = cgroup_parent(cgrp); ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_kn); + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (inode) { ret = inode_permission(inode, MAY_WRITE); iput(inode); @@ -2498,14 +2622,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (!cgrp) return -ENODEV; -retry_find_task: + percpu_down_write(&cgroup_threadgroup_rwsem); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - rcu_read_unlock(); ret = -ESRCH; - goto out_unlock_cgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -2521,37 +2644,23 @@ retry_find_task: */ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; - rcu_read_unlock(); - goto out_unlock_cgroup; + goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); - threadgroup_lock(tsk); - if (threadgroup) { - if (!thread_group_leader(tsk)) { - /* - * a race with de_thread from another thread's exec() - * may strip us of our leadership, if this happens, - * there is no choice but to throw this task away and - * try again; this is - * "double-double-toil-and-trouble-check locking". - */ - threadgroup_unlock(tsk); - put_task_struct(tsk); - goto retry_find_task; - } - } - ret = cgroup_procs_write_permission(tsk, cgrp, of); if (!ret) ret = cgroup_attach_task(cgrp, tsk, threadgroup); - threadgroup_unlock(tsk); - put_task_struct(tsk); -out_unlock_cgroup: + goto out_unlock_threadgroup; + +out_unlock_rcu: + rcu_read_unlock(); +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2690,12 +2799,15 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *css; struct css_set *src_cset; int ret; lockdep_assert_held(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* look up all csses currently attached to @cgrp's subtree */ down_read(&css_set_rwsem); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { @@ -2716,61 +2828,24 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) if (ret) goto out_finish; + down_write(&css_set_rwsem); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { - struct task_struct *last_task = NULL, *task; + struct task_struct *task, *ntask; /* src_csets precede dst_csets, break on the first dst_cset */ if (!src_cset->mg_src_cgrp) break; - /* - * All tasks in src_cset need to be migrated to the - * matching dst_cset. Empty it process by process. We - * walk tasks but migrate processes. The leader might even - * belong to a different cset but such src_cset would also - * be among the target src_csets because the default - * hierarchy enforces per-process membership. - */ - while (true) { - down_read(&css_set_rwsem); - task = list_first_entry_or_null(&src_cset->tasks, - struct task_struct, cg_list); - if (task) { - task = task->group_leader; - WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); - get_task_struct(task); - } - up_read(&css_set_rwsem); - - if (!task) - break; - - /* guard against possible infinite loop */ - if (WARN(last_task == task, - "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) - goto out_finish; - last_task = task; - - threadgroup_lock(task); - /* raced against de_thread() from another thread? */ - if (!thread_group_leader(task)) { - threadgroup_unlock(task); - put_task_struct(task); - continue; - } - - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - - threadgroup_unlock(task); - put_task_struct(task); - - if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) - goto out_finish; - } + /* all tasks in src_csets need to be migrated */ + list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) + cgroup_taskset_add(task, &tset); } + up_write(&css_set_rwsem); + ret = cgroup_taskset_migrate(&tset, cgrp); out_finish: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -2797,7 +2872,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { @@ -2921,7 +2997,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = create_css(child, ss, cgrp->subtree_control & (1 << ssid)); else - ret = cgroup_populate_dir(child, 1 << ssid); + ret = css_populate_dir(cgroup_css(child, ss), + NULL); if (ret) goto err_undo_css; } @@ -2954,7 +3031,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (css_disable & (1 << ssid)) { kill_css(css); } else { - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); if (ss->css_reset) ss->css_reset(css); } @@ -3002,15 +3079,16 @@ err_undo_css: if (css_enable & (1 << ssid)) kill_css(css); else - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); } } goto out_unlock; } -static int cgroup_populated_show(struct seq_file *seq, void *v) +static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + seq_printf(seq, "populated %d\n", + (bool)seq_css(seq)->cgroup->populated_cnt); return 0; } @@ -3153,7 +3231,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3175,33 +3254,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->write == cgroup_procs_write) - cgrp->procs_kn = kn; - else if (cft->seq_show == cgroup_populated_show) - cgrp->populated_kn = kn; + if (cft->file_offset) { + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + kernfs_get(kn); + cfile->kn = kn; + list_add(&cfile->node, &css->files); + } + return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory - * @cgrp: the target cgroup + * @css: the target css + * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * For removals, this function never fails. If addition fails, this - * function doesn't remove files already added. The caller is responsible - * for cleaning up. + * For removals, this function never fails. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add) { - struct cftype *cft; + struct cftype *cft, *cft_end = NULL; int ret; lockdep_assert_held(&cgroup_mutex); - for (cft = cfts; cft->name[0] != '\0'; cft++) { +restart: + for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; @@ -3213,11 +3297,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if (is_add) { - ret = cgroup_add_file(cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); - return ret; + cft_end = cft; + is_add = false; + goto restart; } } else { cgroup_rm_file(cgrp, cft); @@ -3243,7 +3329,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - ret = cgroup_addrm_files(cgrp, cfts, is_add); + ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } @@ -3355,7 +3441,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; - if (ss->disabled) + if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') @@ -3830,7 +3916,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(to, task, false); + ret = cgroup_migrate(task, false, to); put_task_struct(task); } } while (task && !ret); @@ -4327,13 +4413,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", + .file_offset = offsetof(struct cgroup, procs_file), .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.controllers", @@ -4351,9 +4437,10 @@ static struct cftype cgroup_dfl_base_files[] = { .write = cgroup_subtree_control_write, }, { - .name = "cgroup.populated", + .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_populated_show, + .file_offset = offsetof(struct cgroup, events_file), + .seq_show = cgroup_events_show, }, { } /* terminate */ }; @@ -4368,7 +4455,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.clone_children", @@ -4388,7 +4474,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup_tasks_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -4405,37 +4490,6 @@ static struct cftype cgroup_legacy_base_files[] = { { } /* terminate */ }; -/** - * cgroup_populate_dir - create subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be added - * - * On failure, no file is added. - */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i, ret = 0; - - /* process cftsets of each subsystem */ - for_each_subsys(ss, i) { - struct cftype *cfts; - - if (!(subsys_mask & (1 << i))) - continue; - - list_for_each_entry(cfts, &ss->cfts, node) { - ret = cgroup_addrm_files(cgrp, cfts, true); - if (ret < 0) - goto err; - } - } - return 0; -err: - cgroup_clear_dir(cgrp, subsys_mask); - return ret; -} - /* * css destruction is four-stage process. * @@ -4464,9 +4518,13 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); + list_for_each_entry(cfile, &css->files, node) + kernfs_put(cfile->kn); + if (ss) { /* css free path */ int id = css->id; @@ -4571,6 +4629,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { @@ -4653,7 +4712,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css->id = err; if (visible) { - err = cgroup_populate_dir(cgrp, 1 << ss->id); + err = css_populate_dir(css, NULL); if (err) goto err_free_id; } @@ -4679,7 +4738,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err_list_del: list_del_rcu(&css->sibling); - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: @@ -4696,7 +4755,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - struct cftype *base_files; int ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. @@ -4772,12 +4830,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - if (cgroup_on_dfl(cgrp)) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(cgrp, base_files, true); + ret = css_populate_dir(&cgrp->self, NULL); if (ret) goto out_destroy; @@ -4864,7 +4917,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); /* * Killing would put the base ref, but we need to keep it alive @@ -5083,6 +5136,7 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; + BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5116,7 +5170,7 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (ss->disabled) + if (!cgroup_ssid_enabled(ssid)) continue; cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -5251,7 +5305,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), !ss->disabled); + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); mutex_unlock(&cgroup_mutex); return 0; @@ -5541,7 +5596,7 @@ static int __init cgroup_disable(char *str) strcmp(token, ss->legacy_name)) continue; - ss->disabled = 1; + static_branch_disable(cgroup_subsys_enabled_key[i]); printk(KERN_INFO "Disabling %s control group subsystem\n", ss->name); break; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0acff0f66c9..e4d999929903 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_on_dfl(css->cgroup) && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; - struct mm_struct *mm; struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); + struct task_struct *leader; struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; @@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This is - * expensive and may sleep. + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - mm = get_task_mm(leader); - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - - /* - * old_mems_allowed is the same with mems_allowed here, except - * if this task is being moved automatically due to hotplug. - * In that case @mems_allowed has been updated and is empty, - * so @old_mems_allowed is the right nodesets that we migrate - * mm from. - */ - if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, - &cpuset_attach_nodemask_to); + cgroup_taskset_for_each_leader(leader, tset) { + struct mm_struct *mm = get_task_mm(leader); + + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + + /* + * old_mems_allowed is the same with mems_allowed + * here, except if this task is being moved + * automatically due to hotplug. In that case + * @mems_allowed has been updated and is empty, so + * @old_mems_allowed is the right nodesets that we + * migrate mm from. + */ + if (is_memory_migrate(cs)) { + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, + &cpuset_attach_nodemask_to); + } + mmput(mm); } - mmput(mm); } cs->old_mems_allowed = cpuset_attach_nodemask_to; @@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); break; @@ -1863,9 +1866,6 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - .mode = S_IRUGO, }, { @@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(cs->css.cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(root_css->cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2210,7 +2210,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_on_dfl(cs->css.cgroup)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); diff --git a/kernel/fork.c b/kernel/fork.c index 2845623fb582..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1149,10 +1149,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); -#ifdef CONFIG_CGROUPS - init_rwsem(&sig->group_rwsem); -#endif - sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ddaeba34e09..33c8dad6830f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -434,7 +434,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) memcg = page->mem_cgroup; - if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) + if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; rcu_read_unlock(); @@ -4060,8 +4060,7 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "cgroup.event_control", /* XXX: for compat */ .write = memcg_write_event_control, - .flags = CFTYPE_NO_PREFIX, - .mode = S_IWUGO, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, }, { .name = "swappiness", @@ -4829,7 +4828,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *from; - struct task_struct *p; + struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; @@ -4843,7 +4842,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, if (!move_flags) return 0; - p = cgroup_taskset_first(tset); + /* + * Multi-process migrations only happen on the default hierarchy + * where charge immigration is not used. Perform charge + * immigration if @tset contains a leader and whine if there are + * multiple. + */ + p = NULL; + cgroup_taskset_for_each_leader(leader, tset) { + WARN_ON_ONCE(p); + p = leader; + } + if (!p) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -5059,7 +5071,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_on_dfl(root_css->cgroup)) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) root_mem_cgroup->use_hierarchy = true; else root_mem_cgroup->use_hierarchy = false; @@ -5203,6 +5215,7 @@ static struct cftype memory_files[] = { { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, { } /* terminate */ |