diff options
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 38 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 44 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 11 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 10 | ||||
-rw-r--r-- | drivers/char/hw_random/bcm2835-rng.c | 10 | ||||
-rw-r--r-- | fs/cifs/file.c | 2 | ||||
-rw-r--r-- | include/linux/mm_types.h | 3 | ||||
-rw-r--r-- | include/linux/slab.h | 11 | ||||
-rw-r--r-- | kernel/user_namespace.c | 11 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 183 | ||||
-rw-r--r-- | mm/slob.c | 10 | ||||
-rw-r--r-- | mm/slub.c | 5 | ||||
-rw-r--r-- | virt/kvm/ioapic.c | 25 |
19 files changed, 255 insertions, 128 deletions
@@ -1,7 +1,7 @@ VERSION = 3 -PATCHLEVEL = 14 +PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Shuffling Zombie Juror # *DOCUMENTATION* diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fcaf9c961265..7de069afb382 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -60,7 +60,7 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bea60671ef8a..f47a104a749c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -308,7 +308,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX); + F(ADX) | F(SMAP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a2a1bb7ed8c1..eeecbed26ac7 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -48,6 +48,14 @@ static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMAP)); +} + static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f5704d9e5ddc..813d31038b93 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3601,20 +3601,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; - bool fault, x, w, u, wf, uf, ff, smep; + bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; - smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { pfec = byte << 1; map = 0; wf = pfec & PFERR_WRITE_MASK; uf = pfec & PFERR_USER_MASK; ff = pfec & PFERR_FETCH_MASK; + /* + * PFERR_RSVD_MASK bit is set in PFEC if the access is not + * subject to SMAP restrictions, and cleared otherwise. The + * bit is only meaningful if the SMAP bit is set in CR4. + */ + smapf = !(pfec & PFERR_RSVD_MASK); for (bit = 0; bit < 8; ++bit) { x = bit & ACC_EXEC_MASK; w = bit & ACC_WRITE_MASK; @@ -3626,12 +3633,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, /* Allow supervisor writes if !cr0.wp */ w |= !is_write_protection(vcpu) && !uf; /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + x &= !(cr4_smep && u && !uf); + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are ture: + * - X86_CR4_SMAP is set in CR4 + * - An user page is accessed + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in + * permission_fault() and is in smapf. + * + * Also, SMAP does not affect instruction + * fetches, add the !ff check here to make it + * clearer. + */ + smap = cr4_smap && u && !uf && !ff; } else /* Not really needed: no U/S accesses on ept */ u = 1; - fault = (ff && !x) || (uf && !u) || (wf && !w); + fault = (ff && !x) || (uf && !u) || (wf && !w) || + (smapf && smap); map |= fault << bit; } mmu->permissions[byte] = map; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 292615274358..3842e70bdb7c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,11 +44,17 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); @@ -73,6 +79,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); +void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -110,10 +118,30 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? */ -static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, - unsigned pfec) +static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { - return (mmu->permissions[pfec >> 1] >> pte_access) & 1; + int cpl = kvm_x86_ops->get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * + * If CPL = 3, SMAP applies to all supervisor-mode data accesses + * (these are implicit supervisor accesses) regardless of the value + * of EFLAGS.AC. + * + * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving + * the result in X86_EFLAGS_AC. We then insert it in place of + * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, + * but it will be one in index if SMAP checks are being overridden. + * It is important to keep this branchless. + */ + unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); + int index = (pfec >> 1) + + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + + return (mmu->permissions[index] >> pte_access) & 1; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b1e6c1bf68d3..123efd3ec29f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -353,7 +353,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(mmu, pte_access, access))) { + if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1320e0f8e611..1f68c5831924 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3484,13 +3484,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; /* - * SMEP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode to + * SMEP/SMAP is disabled if CPU is in non-paging mode + * in hardware. However KVM always uses paging mode to * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP needs to be manually - * disabled when guest switches to non-paging mode. + * To emulate this behavior, SMEP/SMAP needs to be + * manually disabled when guest switches to non-paging + * mode. */ - hw_cr4 &= ~X86_CR4_SMEP; + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d1b5cd4d34c..8b8fc0b792ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -652,6 +652,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; @@ -680,6 +683,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_SMAP) + update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -1117,7 +1123,6 @@ static inline u64 get_kernel_ns(void) { struct timespec ts; - WARN_ON(preemptible()); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); return timespec_to_ns(&ts); @@ -4164,7 +4169,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, | (write ? PFERR_WRITE_MASK : 0); if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { + && !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c index 8c3b255e629a..e900961cdd2e 100644 --- a/drivers/char/hw_random/bcm2835-rng.c +++ b/drivers/char/hw_random/bcm2835-rng.c @@ -61,18 +61,18 @@ static int bcm2835_rng_probe(struct platform_device *pdev) } bcm2835_rng_ops.priv = (unsigned long)rng_base; + /* set warm-up count & enable */ + __raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS); + __raw_writel(RNG_RBGEN, rng_base + RNG_CTRL); + /* register driver */ err = hwrng_register(&bcm2835_rng_ops); if (err) { dev_err(dev, "hwrng registration failed\n"); iounmap(rng_base); - } else { + } else dev_info(dev, "hwrng registered\n"); - /* set warm-up count & enable */ - __raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS); - __raw_writel(RNG_RBGEN, rng_base + RNG_CTRL); - } return err; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8807442c94dd..8add25538a3b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2754,7 +2754,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; - size_t copy = min(remaining, PAGE_SIZE); + size_t copy = min_t(size_t, remaining, PAGE_SIZE); size_t written = copy_page_to_iter(page, 0, copy, iter); remaining -= written; if (written < copy && iov_iter_count(iter) > 0) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b58d192ea24..8967e20cbe57 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -124,6 +124,8 @@ struct page { union { struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! + * Can be used as a generic list + * by the page owner. */ struct { /* slub per cpu partial pages */ struct page *next; /* Next partial slab */ @@ -136,7 +138,6 @@ struct page { #endif }; - struct list_head list; /* slobs list of pages */ struct slab *slab_page; /* slab fields */ struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU diff --git a/include/linux/slab.h b/include/linux/slab.h index 3dd389aa91c7..307bfbe62387 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -242,6 +242,17 @@ struct kmem_cache { #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif +/* + * This restriction comes from byte sized index implementation. + * Page size is normally 2^12 bytes and, in this case, if we want to use + * byte sized index which can represent 2^8 entries, the size of the object + * should be equal or greater to 2^12 / 2^8 = 2^4 = 16. + * If minimum size of kmalloc is less than 16, we use it as minimum object + * size and give up to use byte sized index. + */ +#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ + (KMALLOC_MIN_SIZE) : 16) + #ifndef CONFIG_SLOB extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; #ifdef CONFIG_ZONE_DMA diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d8f6023fd8d..bf71b4b2d632 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. */ mutex_lock(&id_map_mutex); diff --git a/mm/shmem.c b/mm/shmem.c index 8f1a95406bae..9f70e02111c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1411,7 +1411,7 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; - int error; + int error = 0; ssize_t retval; size_t count; loff_t *ppos = &iocb->ki_pos; diff --git a/mm/slab.c b/mm/slab.c index 3db4cb06e32e..388cb1ae6fbc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -157,6 +157,17 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; +#else +typedef unsigned short freelist_idx_t; +#endif + +#define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) + /* * true if a page was allocated from pfmemalloc reserves for network-based * swap @@ -277,8 +288,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) @@ -565,9 +576,31 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) { - return ALIGN(nr_objs * sizeof(unsigned int), align); + int nr_objs; + size_t freelist_size; + + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = slab_size / (buffer_size + idx_size); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + freelist_size = slab_size - nr_objs * buffer_size; + if (freelist_size < ALIGN(nr_objs * idx_size, align)) + nr_objs--; + + return nr_objs; } /* @@ -600,25 +633,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, nr_objs = slab_size / buffer_size; } else { - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) - nr_objs--; - - mgmt_size = slab_mgmt_size(nr_objs, align); + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1067,7 +1084,7 @@ static int init_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { /* - * Set up the size64 kmemlist for cpu before we can + * Set up the kmem_cache_node for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ @@ -1076,12 +1093,12 @@ static int init_cache_node_node(int node) if (!n) return -ENOMEM; kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; /* - * The l3s don't come and go as CPUs come and - * go. slab_mutex is sufficient + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient * protection here. */ cachep->node[node] = n; @@ -1406,8 +1423,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) for_each_online_node(node) { cachep->node[node] = &init_kmem_cache_node[index + node]; cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; } } @@ -2010,6 +2027,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + if (flags & CFLGS_OFF_SLAB) { /* * Max number of objs-per-slab for caches which @@ -2017,7 +2038,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * looping condition in cache_grow(). */ offslab_limit = size; - offslab_limit /= sizeof(unsigned int); + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2103,8 +2124,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; cpu_cache_get(cachep)->avail = 0; cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2243,7 +2264,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2252,6 +2273,12 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) flags |= CFLGS_OFF_SLAB; size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); left_over = calculate_slab_order(cachep, size, cachep->align, flags); @@ -2259,7 +2286,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return -E2BIG; freelist_size = - ALIGN(cachep->num * sizeof(unsigned int), cachep->align); + ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2272,7 +2299,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(unsigned int); + freelist_size = cachep->num * sizeof(freelist_idx_t); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2300,10 +2327,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* - * This is a possibility for one of the malloc_sizes caches. + * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, - * this should not happen at all. + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); @@ -2511,14 +2538,17 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) /* * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches. The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have freelist_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). */ static void *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, @@ -2542,9 +2572,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline unsigned int *slab_freelist(struct page *page) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned char idx) { - return (unsigned int *)(page->freelist); + return ((freelist_idx_t *)page->freelist)[idx]; +} + +static inline void set_free_obj(struct page *page, + unsigned char idx, freelist_idx_t val) +{ + ((freelist_idx_t *)(page->freelist))[idx] = val; } static void cache_init_objs(struct kmem_cache *cachep, @@ -2589,7 +2625,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_freelist(page)[i] = i; + set_free_obj(page, i, i); } } @@ -2608,7 +2644,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, { void *objp; - objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); @@ -2629,7 +2665,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { - if (slab_freelist(page)[i] == objnr) { + if (get_free_obj(page, i) == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2637,7 +2673,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; - slab_freelist(page)[page->active] = objnr; + set_free_obj(page, page->active, objnr); } /* @@ -2886,9 +2922,9 @@ retry: /* move slabp to correct slabp list: */ list_del(&page->lru); if (page->active == cachep->num) - list_add(&page->list, &n->slabs_full); + list_add(&page->lru, &n->slabs_full); else - list_add(&page->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -3245,11 +3281,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); - if (likely(ptr)) + if (likely(ptr)) { kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } return ptr; } @@ -3310,17 +3346,17 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) flags); prefetchw(objp); - if (likely(objp)) + if (likely(objp)) { kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } return objp; } /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) @@ -3574,11 +3610,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; - /* If you want to save a few bytes .text space: replace - * __ with kmem_. - * Then kmalloc uses the uninlined functions instead of the inline - * functions. - */ cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3670,7 +3701,7 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_cache_node *n; @@ -3726,8 +3757,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; n->shared = new_shared; n->alien = new_alien; n->free_limit = (1 + nr_cpus_node(node)) * @@ -3813,7 +3844,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep, gfp); + return alloc_kmem_cache_node(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3982,7 +4013,7 @@ static void cache_reap(struct work_struct *w) if (time_after(n->next_reap, jiffies)) goto next; - n->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE; drain_array(searchp, n, n->shared, 0, node); @@ -4003,7 +4034,7 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO @@ -4210,7 +4241,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, for (j = page->active; j < c->num; j++) { /* Skip freed item */ - if (slab_freelist(page)[j] == i) { + if (get_free_obj(page, j) == i) { active = false; break; } diff --git a/mm/slob.c b/mm/slob.c index 4bf8809dfcce..730cad45d4be 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->list, list); + list_add(&sp->lru, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->list); + list_del(&sp->lru); __ClearPageSlobFree(sp); } @@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, list) { + list_for_each_entry(sp, slob_list, lru) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) continue; /* Attempt to alloc */ - prev = sp->list.prev; + prev = sp->lru.prev; b = slob_page_alloc(sp, size, align); if (!b) continue; @@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->list); + INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); diff --git a/mm/slub.c b/mm/slub.c index f620bbf4054a..5e234f1f8853 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1352,11 +1352,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); + page = alloc_slab_page(alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1366,7 +1367,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index d4b601547f1f..2458a1dc2ba9 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -97,6 +97,14 @@ static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); } +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); + +static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic) +{ + if (WARN_ON(ioapic->rtc_status.pending_eoi < 0)) + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; @@ -120,9 +128,8 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) } else { __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); ioapic->rtc_status.pending_eoi--; + rtc_status_pending_eoi_check_valid(ioapic); } - - WARN_ON(ioapic->rtc_status.pending_eoi < 0); } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) @@ -149,10 +156,10 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) { - if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) + if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { --ioapic->rtc_status.pending_eoi; - - WARN_ON(ioapic->rtc_status.pending_eoi < 0); + rtc_status_pending_eoi_check_valid(ioapic); + } } static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) @@ -353,10 +360,16 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) ioapic->irr &= ~(1 << irq); if (irq == RTC_GSI && line_status) { + /* + * pending_eoi cannot ever become negative (see + * rtc_status_pending_eoi_check_valid) and the caller + * ensures that it is only called if it is >= zero, namely + * if rtc_irq_check_coalesced returns false). + */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, ioapic->rtc_status.dest_map); - ioapic->rtc_status.pending_eoi = ret; + ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); |