author		Stephen Rothwell <sfr@canb.auug.org.au>	2014-06-18 13:33:15 +1000
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2014-06-18 13:33:15 +1000
commit		2854a11e7f83dd99c89d2261508c80358cf18183 (patch)
tree		1c86cc6921357134e188f4a9cac1b65906607352
parent		ad5157a45d59fb08514976209d87dae5ac1bb7e1 (diff)
parent		522474ad8130cb802912d2d3e72d19fab55b50a0 (diff)
Merge branch 'akpm/master'
34 files changed, 732 insertions(+), 409 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index d773d5697cf5..3187a18af6da 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -53,3 +53,11 @@ stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write request must not be changed until writeout is complete. + +strictlimit (read-write) + + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 02ab997a1ed2..5b3fdda3ad6b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -57,6 +57,7 @@ Brief summary of control files. memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap (See 5.5 for details) memory.limit_in_bytes # set/show limit of memory usage + memory.low_limit_in_bytes # set/show low limit for memory reclaim memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage memory.failcnt # show the number of memory usage hits limits memory.memsw.failcnt # show the number of memory+Swap hits limits @@ -236,23 +237,26 @@ it by cgroup. 2.5 Reclaim Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: Reclaim does not work for the root cgroup, since we cannot set any -limits on the root cgroup. - -Note2: When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) +global VM. Cgroups can get reclaimed basically under two conditions + - under global memory pressure when all cgroups are reclaimed + proportionally wrt. their LRU size in a round robin fashion + - when a cgroup or its hierarchical parent (see 6. Hierarchical support) + hits hard limit. If the reclaim is unsuccessful, an OOM routine is invoked + to select and kill the bulkiest task in the hiearchy. (See 10. OOM Control + below.) + +Groups might be also protected from both global and limit reclaim by +low_limit_in_bytes knob. If the limit is non-zero the reclaim logic +doesn't include groups (and their subgroups - see 6. Hierarchy support) +which are bellow the low limit if there is other eligible cgroup in the +reclaimed hierarchy. If all groups which participate reclaim are under +their low limits then all of them are reclaimed and the low limit is +ignored. + +Note: When panic_on_oom is set to "2", the whole system will panic. + +When oom event notifier is registered, event will be delivered to the root +of the memory pressure which cannot be handled (See oom_control section) 2.6 Locking @@ -473,6 +477,9 @@ About use_hierarchy, see Section 6. write will still return success. 
In this case, it is expected that memory.kmem.usage_in_bytes == memory.usage_in_bytes. + Please note that this knob is considered deprecated and will be removed + in the future. + About use_hierarchy, see Section 6. 5.2 stat file diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126aceed8..223c32171dcc 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block <bool>: 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. Instead, always use diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c139541deea5..2cecca0d2bbd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3145,6 +3145,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: <integer> + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: <integer> + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e0..c14374e71775 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. 
+The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 375cffcf7dbd..91d219381306 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,7 +89,7 @@ static inline unsigned long get_softint(void) return retval; } -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index b2988f25e230..027e09986194 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -239,7 +239,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -251,16 +251,22 @@ void arch_trigger_all_cpu_backtrace(void) spin_lock_irqsave(&global_cpu_snapshot_lock, flags); - memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); - this_cpu = raw_smp_processor_id(); - __global_reg_self(tp, regs, this_cpu); + memset(global_cpu_snapshot, 0, sizeof(global_cpu_snapshot)); + + if (include_self) + __global_reg_self(tp, regs, this_cpu); smp_fetch_global_regs(); for_each_online_cpu(cpu) { - struct global_reg_snapshot *gp = &global_cpu_snapshot[cpu].reg; + struct global_reg_snapshot *gp; + + if (!include_self && cpu == this_cpu) + continue; + + gp = &global_cpu_snapshot[cpu].reg; __global_reg_poll(gp); @@ -292,7 +298,7 @@ void arch_trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd034cf..a80cbb88ea91 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5de5083..6a1e71bde323 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. 
*/ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/block/bio.c b/block/bio.c index 89c29942f4fd..5fb23bcf5069 100644 --- a/block/bio.c +++ b/block/bio.c @@ -753,29 +753,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). */ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the * queue to get further control @@ -792,23 +794,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + blk_recount_segments(q, bio); + return 0; } /** diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c index 54e54e4cc6c4..cd71e5769a76 100644 --- a/drivers/gpio/gpio-zevio.c +++ b/drivers/gpio/gpio-zevio.c @@ -18,6 +18,10 @@ #include <linux/slab.h> #include <linux/gpio.h> +#ifndef IOMEM +#define IOMEM(x) ((void __force __iomem *)(x)) +#endif + /* * Memory layout: * This chip has four gpio sections, each controls 8 GPIOs. 
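
[Editor's note] The LIMITATION section added to Documentation/filesystems/vfat.txt above, together with the fat_fallocate() implementation in the fs/fat hunks below, means userspace cannot rely on a FALLOC_FL_KEEP_SIZE reservation surviving eviction of the inode. A minimal userspace sketch of the "recheck after reopening" pattern the documentation asks for; the mount point, file name and sizes are illustrative and not taken from the patch:

/* Illustrative only: preallocate clusters on a FAT file without changing
 * i_size, then recheck after reopening, since the unwritten clusters may
 * have been released when the inode was evicted from memory. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/vfat/prealloc.bin", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reserve 16 MiB of clusters; the file size stays unchanged. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
		perror("fallocate");
	close(fd);

	/* After a reopen the reservation may already be gone (see the
	 * LIMITATION text above), so redo it before depending on it. */
	fd = open("/mnt/vfat/prealloc.bin", O_RDWR);
	if (fd >= 0) {
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
			perror("fallocate (recheck)");
		close(fd);
	}
	return 0;
}

Without FALLOC_FL_KEEP_SIZE the same call falls through to fat_cont_expand(), so the clusters are zeroed out and persist like a normal expanding truncate.
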
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 728039d2efe1..ec3fc9335f0d 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -92,9 +92,8 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, err = device_register(&dev->dev); if (err) { printk(KERN_ERR "Failed to register master device. err=%d\n", err); - memset(dev, 0, sizeof(struct w1_master)); - kfree(dev); - dev = NULL; + put_device(&dev->dev); + return NULL; } return dev; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 91ad9e1c9441..e26bc9a22ac9 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -303,6 +303,31 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } +static int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + + if (cluster < 0) + return cluster; + + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create) { @@ -311,7 +336,6 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -329,25 +353,39 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) + * Both ->mmu_private and ->i_disksize can access + * on only allocation path. 
(caller must hold ->i_mutex) */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) >> blocksize_bits; if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); +} + +int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, struct buffer_head *bh_result, + int create, sector_t *bmap) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + BUG_ON(create != 0); + + *bmap = 0; + *mapped_blocks = 0; + + last_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + + if (sector >= last_block) + return 0; + + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + bmap); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..13b7202bd651 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -119,7 +119,8 @@ struct msdos_inode_info { unsigned int cache_valid_id; /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ - loff_t mmu_private; /* physically allocated size */ + loff_t mmu_private; /* physically allocated size (initialized) */ + loff_t i_disksize; /* physically allocated size (uninitialized) */ int i_start; /* first cluster or 0 */ int i_logstart; /* logical first cluster */ @@ -290,6 +291,9 @@ extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create); +extern int fat_bmap2(struct inode *inode, sector_t sector, + unsigned long *mapped_blocks, + struct buffer_head *bh_result, int create, sector_t *bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..92e9e753b554 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -17,8 +17,12 @@ #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -182,6 +186,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -220,6 +225,75 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. 
+ */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int cluster; + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if ((offset + len) <= MSDOS_I(inode)->i_disksize) + goto error; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + goto error; + + if (mode & FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - round_up(MSDOS_I(inode)->i_disksize, + sbi->cluster_size); + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_alloc_clusters(inode, &cluster, 1); + if (err) { + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_alloc_clusters() error"); + goto error; + } + err = fat_chain_add(inode, cluster, 1); + if (err) { + fat_free_clusters(inode, cluster); + goto error; + } + MSDOS_I(inode)->i_disksize += sbi->cluster_size; + } + } else { + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + if (err) + fat_msg(sb, KERN_ERR, + "fat_fallocate(): fat_cont_expand() error"); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -300,8 +374,10 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) * This protects against truncating a file bigger than it was then * trying to write into the hole. 
*/ - if (MSDOS_I(inode)->mmu_private > offset) + if (MSDOS_I(inode)->i_disksize > offset) { MSDOS_I(inode)->mmu_private = offset; + MSDOS_I(inode)->i_disksize = offset; + } nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d96..c3b86c58ad88 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -116,6 +116,25 @@ static int fat_add_cluster(struct inode *inode) return err; } +static void check_fallocated_region(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, struct buffer_head *bh_result) +{ + struct super_block *sb = inode->i_sb; + sector_t last_block, disk_block; + const unsigned long blocksize = sb->s_blocksize; + const unsigned char blocksize_bits = sb->s_blocksize_bits; + + last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + disk_block = (MSDOS_I(inode)->i_disksize + (blocksize - 1)) + >> blocksize_bits; + if (iblock >= last_block && iblock <= disk_block) { + MSDOS_I(inode)->mmu_private += *max_blocks << blocksize_bits; + set_buffer_new(bh_result); + } + +} + static inline int __fat_get_block(struct inode *inode, sector_t iblock, unsigned long *max_blocks, struct buffer_head *bh_result, int create) @@ -130,8 +149,11 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, if (err) return err; if (phys) { - map_bh(bh_result, sb, phys); *max_blocks = min(mapped_blocks, *max_blocks); + if (create) + check_fallocated_region(inode, iblock, max_blocks, + bh_result); + map_bh(bh_result, sb, phys); return 0; } if (!create) @@ -155,6 +177,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + MSDOS_I(inode)->i_disksize = MSDOS_I(inode)->mmu_private; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); if (err) @@ -269,6 +292,13 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; + + /* + * In case of writing in fallocated region, return 0 and + * fallback to buffered write. + */ + if (MSDOS_I(inode)->i_disksize > MSDOS_I(inode)->mmu_private) + return 0; } /* @@ -282,13 +312,36 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + err = fat_bmap2(inode, iblock, &mapped_blocks, bh_result, create, + &bmap); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. 
*/ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -469,7 +522,6 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) error = fat_calc_dir_size(inode); if (error < 0) return error; - MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); } else { /* not a directory */ @@ -484,8 +536,12 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_op = &fat_file_inode_operations; inode->i_fop = &fat_file_operations; inode->i_mapping->a_ops = &fat_aops; - MSDOS_I(inode)->mmu_private = inode->i_size; } + + MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = round_up(inode->i_size, + inode->i_sb->s_blocksize); + if (de->attr & ATTR_SYS) { if (sbi->options.sys_immutable) inode->i_flags |= S_IMMUTABLE; @@ -550,12 +606,34 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); + } else { + /* Release unwritten fallocated blocks on inode eviction. */ + if (MSDOS_I(inode)->i_disksize > + round_up(MSDOS_I(inode)->mmu_private, + inode->i_sb->s_blocksize)) { + int err; + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused fallocated " + "blocks, inode could be corrupted. 
Please run " + "fsck"); + } + } } invalidate_inode_buffers(inode); clear_inode(inode); @@ -1293,6 +1371,7 @@ static int fat_read_root(struct inode *inode) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->mmu_private = inode->i_size; + MSDOS_I(inode)->i_disksize = inode->i_size; fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index e11d60cc867b..6cf9693ea62d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2414,8 +2414,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eb65d29516ca..a5cf853129ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,10 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, bool task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); +extern bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root); +extern bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root); + extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -288,6 +292,16 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &zone->lruvec; } +static inline bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + return false; +} +static inline bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root) +{ + return false; +} + static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { return NULL; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 6a45fb583ff1..447775ee2c4b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void) #ifdef arch_trigger_all_cpu_backtrace static inline bool trigger_all_cpu_backtrace(void) { - arch_trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(true); return true; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(false); + return true; +} #else static inline bool trigger_all_cpu_backtrace(void) { return false; } +static inline bool trigger_allbutself_cpu_backtrace(void) +{ + return false; +} #endif #ifdef CONFIG_LOCKUP_DETECTOR @@ -48,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 56b7bc32db4f..b810855024f9 
100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -40,6 +40,11 @@ struct res_counter { */ unsigned long long soft_limit; /* + * the limit under which the usage cannot be pushed + * due to external pressure. + */ + unsigned long long low_limit; + /* * the number of unsuccessful attempts to consume the resource */ unsigned long long failcnt; @@ -88,6 +93,7 @@ enum { RES_LIMIT, RES_FAILCNT, RES_SOFT_LIMIT, + RES_LOW_LIMIT, }; /* @@ -175,6 +181,28 @@ res_counter_soft_limit_excess(struct res_counter *cnt) return excess; } +/** + * Get the difference between the usage and the low limit + * @cnt: The counter + * + * Returns 0 if usage is less than or equal to low limit + * The difference between usage and low limit, otherwise. + */ +static inline unsigned long long +res_counter_low_limit_excess(struct res_counter *cnt) +{ + unsigned long long excess; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + if (cnt->usage <= cnt->low_limit) + excess = 0; + else + excess = cnt->usage - cnt->low_limit; + spin_unlock_irqrestore(&cnt->lock, flags); + return excess; +} + static inline void res_counter_reset_max(struct res_counter *cnt) { unsigned long flags; @@ -220,4 +248,16 @@ res_counter_set_soft_limit(struct res_counter *cnt, return 0; } +static inline int +res_counter_set_low_limit(struct res_counter *cnt, + unsigned long long low_limit) +{ + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + cnt->low_limit = low_limit; + spin_unlock_irqrestore(&cnt->lock, flags); + return 0; +} + #endif diff --git a/init/main.c b/init/main.c index e8ae1fef0908..bb1aed928f21 100644 --- a/init/main.c +++ b/init/main.c @@ -6,7 +6,7 @@ * GK 2/5/95 - Changed to support mounting root fs via NFS * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 - * Simplified starting of init: Michael A. Griffith <grif@acm.org> + * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ #define DEBUG /* Enable initcall_debug */ @@ -136,7 +136,7 @@ static char *ramdisk_execute_command; * Used to generate warnings if static_key manipulation functions are used * before jump_label_init is called. 
*/ -bool static_key_initialized __read_mostly = false; +bool static_key_initialized __read_mostly; EXPORT_SYMBOL_GPL(static_key_initialized); /* @@ -159,8 +159,8 @@ static int __init set_reset_devices(char *str) __setup("reset_devices", set_reset_devices); -static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; @@ -199,7 +199,6 @@ static int __init obsolete_checksetup(char *line) * still work even if initially too large, it will just take slightly longer */ unsigned long loops_per_jiffy = (1<<12); - EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) @@ -376,8 +375,8 @@ static void __init setup_command_line(char *command_line) initcall_command_line = memblock_virt_alloc(strlen(boot_command_line) + 1, 0); static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); - strcpy (saved_command_line, boot_command_line); - strcpy (static_command_line, command_line); + strcpy(saved_command_line, boot_command_line); + strcpy(static_command_line, command_line); } /* @@ -445,8 +444,8 @@ void __init parse_early_options(char *cmdline) /* Arch code calls this early on, or if not, just before other parsing. */ void __init parse_early_param(void) { - static __initdata int done = 0; - static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; + static int done __initdata; + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; if (done) return; @@ -500,7 +499,8 @@ static void __init mm_init(void) asmlinkage __visible void __init start_kernel(void) { - char * command_line, *after_dashes; + char *command_line; + char *after_dashes; extern const struct kernel_param __start___param[], __stop___param[]; /* @@ -572,7 +572,8 @@ asmlinkage __visible void __init start_kernel(void) * fragile until we cpu_idle() for the first time. 
*/ preempt_disable(); - if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) + if (WARN(!irqs_disabled(), + "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); rcu_init(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3214289df5a7..2ac9f133c4da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -76,7 +76,7 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance); static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; @@ -297,9 +297,9 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c + * - under the kprobe_mutex - during kprobe_[un]register() + * OR + * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ struct kprobe *get_kprobe(void *addr) { @@ -567,7 +567,8 @@ static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); - while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + while (!list_empty(&optimizing_list) || + !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); /* this will also make optimizing_work execute immmediately */ @@ -676,8 +677,8 @@ static void reuse_unused_kprobe(struct kprobe *ap) */ op = container_of(ap, struct optimized_kprobe, kp); if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + pr_warn("Warning: found a stray unused aggrprobe@%p\n", + ap->addr); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -794,7 +795,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } - printk(KERN_INFO "Kprobes globally optimized\n"); + pr_info("Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); } @@ -824,7 +825,7 @@ static void unoptimize_all_kprobes(void) /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); + pr_info("Kprobes globally unoptimized\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); @@ -896,7 +897,7 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) /* There should be no unused kprobes can be reused without optimization */ static void reuse_unused_kprobe(struct kprobe *ap) { - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + pr_err("Error: There should be no unused kprobe here.\n"); BUG_ON(kprobe_unused(ap)); } @@ -955,7 +956,8 @@ static void disarm_kprobe_ftrace(struct kprobe *p) } ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", + p->addr, ret); } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) @@ -1389,7 +1391,7 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) if (p != ap) { list_for_each_entry_rcu(list_p, &ap->list, list) if (list_p == p) - /* kprobe p is a valid probe */ + /* kprobe p is a valid probe */ goto valid; return NULL; } @@ -2018,8 +2020,8 @@ EXPORT_SYMBOL_GPL(enable_kprobe); void dump_kprobe(struct kprobe *kp) { 
- printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + pr_warn("Dumping kprobe:\n"); + pr_warn("Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2128,7 +2130,7 @@ static int __init init_kprobes(void) kprobe_lookup_name(kretprobe_blacklist[i].name, kretprobe_blacklist[i].addr); if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", + pr_warn("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); } } @@ -2310,7 +2312,7 @@ static void arm_all_kprobes(void) } kprobes_all_disarmed = false; - printk(KERN_INFO "Kprobes globally enabled\n"); + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); @@ -2332,7 +2334,7 @@ static void disarm_all_kprobes(void) } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); + pr_info("Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e791130f85a7..85ea94305015 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -136,6 +136,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->failcnt; case RES_SOFT_LIMIT: return &counter->soft_limit; + case RES_LOW_LIMIT: + return &counter->low_limit; }; BUG(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138f..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..5d2c48f5083b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. 
Soft- @@ -244,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -271,6 +289,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,7 +336,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -327,6 +357,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -445,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/mm/Makefile b/mm/Makefile index 4064f3ec145e..1eaa70bed7a4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..382f8ece1cb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -234,11 +234,46 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strictlimit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int val; + ssize_t ret; + + ret = kstrtouint(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + break; + case 1: + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + break; + default: + return -EINVAL; + } + + return count; +} +static ssize_t 
strictlimit_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strictlimit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strictlimit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 72b8fa361433..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include <linux/export.h> -#include <linux/backing-dev.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/file.h> -#include <linux/mman.h> -#include <linux/pagemap.h> -#include <linux/swapops.h> -#include <linux/rmap.h> -#include <linux/syscalls.h> -#include <linux/mmu_notifier.h> - -#include <asm/mmu_context.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). 
- */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. - */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. 
- */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - mutex_lock(&mapping->i_mmap_mutex); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). - */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 48d4cfdb2793..2a18af6d9cf4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1666,8 +1666,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("memory: usage %llukB, low_limit %llukB limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LOW_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_FAILCNT)); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", @@ -2779,6 +2780,46 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/** + * mem_cgroup_within_guarantee - checks whether given memcg is within its + * memory guarantee + * @memcg: target memcg for the reclaim + * @root: root of the reclaim hierarchy (null for the global reclaim) + * + * The given group is within its reclaim gurantee if it is below its low limit + * or the same applies for any parent up the hierarchy until root (including). + * Such a group might be excluded from the reclaim. 
+ */ +bool mem_cgroup_within_guarantee(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + if (mem_cgroup_disabled()) + return false; + + do { + if (!res_counter_low_limit_excess(&memcg->res)) + return true; + if (memcg == root) + break; + + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} + +bool mem_cgroup_all_within_guarantee(struct mem_cgroup *root) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, root) + if (!mem_cgroup_within_guarantee(iter, root)) { + mem_cgroup_iter_break(root, iter); + return false; + } + + return true; +} + struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -4773,6 +4814,10 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, if (mem_cgroup_is_root(memcg)) return -EINVAL; + pr_info_once("%s (%d): memory.force_empty is deprecated and will be " + "removed. Let us know if it is needed in your usecase at " + "linux-mm@kvack.org\n", + current->comm, task_pid_nr(current)); return mem_cgroup_force_empty(memcg) ?: nbytes; } @@ -5056,6 +5101,24 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, else return -EINVAL; break; + case RES_LOW_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } + ret = res_counter_memparse_write_strategy(buf, &val); + if (ret) + break; + if (type == _MEM) { + ret = res_counter_set_low_limit(&memcg->res, val); + break; + } + /* + * memsw low limit doesn't make any sense and kmem is not + * implemented yet - if ever + */ + return -EINVAL; + case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buf, &val); if (ret) @@ -5982,6 +6045,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read_u64, }, { + .name = "low_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_LOW_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write = mem_cgroup_write, @@ -5999,6 +6068,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", + .flags = CFTYPE_INSANE, .write = mem_cgroup_force_empty_write, }, { diff --git a/mm/mmap.c b/mm/mmap.c index 129b847d30cc..af82b67c9f4d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2581,6 +2581,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? 
PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 4a852f6c5709..6cc583a131dd 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1999,14 +1999,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/page_io.c b/mm/page_io.c index 955db8b0d497..f864ef3d3f99 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -277,6 +277,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, }; from.bvec = &bv; /* older gcc versions are broken */ + /* Do this by hand because old gcc messes up the initializer */ + from.bvec = &bv; + init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); kiocb.ki_nbytes = PAGE_SIZE; diff --git a/mm/vmscan.c b/mm/vmscan.c index 967bc7ea239d..b19ebb3a666b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2231,9 +2231,21 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +/** + * __shrink_zone - shrinks a given zone + * + * @zone: zone to shrink + * @sc: scan control with additional reclaim parameters + * @honor_memcg_guarantee: do not reclaim memcgs which are within their memory + * guarantee + * + * Returns the number of reclaimed memcgs. + */ +static unsigned __shrink_zone(struct zone *zone, struct scan_control *sc, + bool honor_memcg_guarantee) { unsigned long nr_reclaimed, nr_scanned; + unsigned nr_scanned_groups = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2250,7 +2262,20 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec; + /* Memcg might be protected from the reclaim */ + if (honor_memcg_guarantee && + mem_cgroup_within_guarantee(memcg, root)) { + /* + * It would be more optimal to skip the memcg + * subtree now but we do not have a memcg iter + * helper for that. Anyone? 
+ */ + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + nr_scanned_groups++; sc->swappiness = mem_cgroup_swappiness(memcg); shrink_lruvec(lruvec, sc); @@ -2279,6 +2304,27 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return nr_scanned_groups; +} + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ + bool honor_guarantee = true; + + while (!__shrink_zone(zone, sc, honor_guarantee)) { + /* + * The previous round of reclaim didn't find anything to scan + * because + * a) the whole reclaimed hierarchy is within guarantee so + * we fallback to ignore the guarantee because other option + * would be the OOM + * b) multiple reclaimers are racing and so the first round + * should be retried + */ + if (mem_cgroup_all_within_guarantee(sc->target_mem_cgroup)) + honor_guarantee = false; + } } /* Returns true if compaction should go ahead for a high-order request */ |
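
[Editor's note] The memory.low_limit_in_bytes knob introduced in the res_counter, memcontrol and vmscan hunks above is driven entirely through the cgroup filesystem. A minimal userspace sketch of giving one group a reclaim guarantee, assuming a cgroup v1 memory controller mounted at /sys/fs/cgroup/memory; the group name "protected" and the 256 MiB value are illustrative, not from the patch:

/* Illustrative only: give one memcg a 256 MiB reclaim guarantee by writing
 * to memory.low_limit_in_bytes. Groups below their low limit are skipped by
 * reclaim while some other group in the reclaimed hierarchy is still
 * eligible; if every group is under its guarantee, shrink_zone() retries
 * with the guarantee ignored instead of going to OOM. */
#include <stdio.h>
#include <stdlib.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF)
		perror(path);
	return fclose(f);
}

int main(void)
{
	/* Hypothetical group created beforehand with mkdir(1). */
	const char *knob =
		"/sys/fs/cgroup/memory/protected/memory.low_limit_in_bytes";

	return write_knob(knob, "268435456") ? EXIT_FAILURE : EXIT_SUCCESS;
}

Per the memory.txt text above, a value of 0 (the default) means no guarantee, and the knob cannot be set on the root cgroup.
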