diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/damon/dbgfs.c | 7 | ||||
-rw-r--r-- | mm/huge_memory.c | 11 | ||||
-rw-r--r-- | mm/hugetlb.c | 4 | ||||
-rw-r--r-- | mm/hugetlb_vmemmap.c | 1 | ||||
-rw-r--r-- | mm/kmemleak.c | 61 | ||||
-rw-r--r-- | mm/kmsan/instrumentation.c | 1 | ||||
-rw-r--r-- | mm/kmsan/kmsan.h | 2 | ||||
-rw-r--r-- | mm/kmsan/shadow.c | 1 | ||||
-rw-r--r-- | mm/maccess.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 12 | ||||
-rw-r--r-- | mm/memory-failure.c | 5 | ||||
-rw-r--r-- | mm/memory-tiers.c | 8 | ||||
-rw-r--r-- | mm/memremap.c | 1 | ||||
-rw-r--r-- | mm/migrate.c | 7 | ||||
-rw-r--r-- | mm/mmap.c | 9 | ||||
-rw-r--r-- | mm/page_alloc.c | 1 | ||||
-rw-r--r-- | mm/page_isolation.c | 2 | ||||
-rw-r--r-- | mm/shmem.c | 17 | ||||
-rw-r--r-- | mm/slab_common.c | 24 | ||||
-rw-r--r-- | mm/userfaultfd.c | 27 |
20 files changed, 146 insertions, 57 deletions
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 6f0ae7d3ae39..b3f454a5c682 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -890,6 +890,7 @@ out: static int dbgfs_rm_context(char *name) { struct dentry *root, *dir, **new_dirs; + struct inode *inode; struct damon_ctx **new_ctxs; int i, j; int ret = 0; @@ -905,6 +906,12 @@ static int dbgfs_rm_context(char *name) if (!dir) return -ENOENT; + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; + goto out_dput; + } + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); if (!new_dirs) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fc7e5edf07..811d19b5c4f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2206,9 +2206,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); + /* + * NOTE: we don't do pte_mkdirty when dirty==true + * because it breaks sparc64 which can sigsegv + * random process. Need to revisit when we figure + * out what is special with sparc64. + */ if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) @@ -2462,7 +2465,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Fix up and warn once if private is unexpectedly set. */ if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); page_tail->private = 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 546df97c31e4..e48f8ef45b17 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ptl = huge_pte_lock(h, dst_mm, dst_pte); + ret = -EIO; + if (PageHWPoison(page)) + goto out_release_unlock; + /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba2a2596fb4e..4962dd1ba4a6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> +#include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1461,6 +1461,27 @@ static void scan_gray_list(void) } /* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,8 +1605,16 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -1632,8 +1647,16 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include <linux/gfp.h> +#include <linux/kmsan_string.h> #include <linux/mm.h> #include <linux/uaccess.h> diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 7019c46d33a7..a14744205435 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -124,6 +124,8 @@ static __always_inline bool kmsan_in_runtime(void) { if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) return true; + if (in_nmi()) + return true; return kmsan_get_context()->kmsan_in_runtime; } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/mm/maccess.c b/mm/maccess.c index 5f4d240f67ec..074f6b086671 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) return src - unsafe_addr; Efault: pagefault_enable(); - dst[-1] = '\0'; + dst[0] = '\0'; return -EFAULT; } diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 145bb561ddb3..bead6bccc7f2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) int res; struct page *hpage = compound_head(p); struct address_space *mapping; + bool extra_pins = false; if (!PageHuge(hpage)) return MF_DELAYED; @@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, page_to_pfn(p), mapping); + /* The page is kept in page cache. */ + extra_pins = true; unlock_page(hpage); } else { unlock_page(hpage); @@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) } } - if (has_extra_refcount(ps, p, false)) + if (has_extra_refcount(ps, p, extra_pins)) res = MF_FAILED; return res; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; diff --git a/mm/memremap.c b/mm/memremap.c index 421bec3a29ee..08cbf54fe037 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -335,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } + params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: break; diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); diff --git a/mm/mmap.c b/mm/mmap.c index e270057ed04e..c3c5c1d6103d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2674,6 +2674,8 @@ cannot_expand: error = -EINVAL; if (file) goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; else goto free_vma; } @@ -2682,6 +2684,8 @@ cannot_expand: error = -ENOMEM; if (file) goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; else goto free_vma; } @@ -2751,7 +2755,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); - if (vm_flags & VM_SHARED) + if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); @@ -2852,6 +2856,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a6c815ae28..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 33b1886b06eb..0042fb2730d1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -941,7 +941,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(_RET_IP_, ret, size, + trace_kmalloc(caller, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } @@ -953,7 +953,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); + trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } @@ -1010,7 +1010,7 @@ EXPORT_SYMBOL(kfree); /** * __ksize -- Report full size of underlying allocation - * @objp: pointer to the object + * @object: pointer to the object * * This should only be used internally to query the true size of allocations. * It is not meant to be a way to discover the usable size of an allocation @@ -1018,7 +1018,7 @@ EXPORT_SYMBOL(kfree); * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, * and/or FORTIFY_SOURCE. * - * Return: size of the actual memory used by @objp in bytes + * Return: size of the actual memory used by @object in bytes */ size_t __ksize(const void *object) { @@ -1040,7 +1040,6 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } -#ifdef CONFIG_TRACING void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, @@ -1064,7 +1063,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, return ret; } EXPORT_SYMBOL(kmalloc_node_trace); -#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -1411,20 +1409,6 @@ void kfree_sensitive(const void *p) } EXPORT_SYMBOL(kfree_sensitive); -/** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object - * - * kmalloc may internally round up allocations and return more memory - * than requested. ksize() can be used to determine the actual amount of - * memory allocated. The caller may use this additional memory, even though - * a smaller amount of memory was initially specified with the kmalloc call. - * The caller must guarantee that objp points to a valid object previously - * allocated with either kmalloc() or kmem_cache_alloc(). The object - * must not be freed during the duration of the call. - * - * Return: size of the actual memory used by @objp in bytes - */ size_t ksize(const void *objp) { size_t size; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..650ab6cfd5f4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -64,7 +64,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; - bool page_in_cache = page->mapping; + bool page_in_cache = page_mapping(page); spinlock_t *ptl; struct inode *inode; pgoff_t offset, max_off; @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; |