From eda4623cf989d033c07355be1469e44f321ce8ae Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 May 2025 16:27:57 +0100 Subject: io_uring/zcrx: init id for xa_find xa_find() interprets id as the lower bound and thus expects it initialised. Reported-by: syzbot+c3ff04150c30d3df0f57@syzkaller.appspotmail.com Fixes: 76f1cc98b23ce ("io_uring/zcrx: add support for multiple ifqs") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/faea44ef63131e6968f635e1b6b7ca6056f1f533.1748359655.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9a568d049204..0c5b7d8f8d67 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -630,12 +630,13 @@ ifq_free: void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; - unsigned long id; lockdep_assert_held(&ctx->uring_lock); while (1) { scoped_guard(mutex, &ctx->mmap_lock) { + unsigned long id = 0; + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); if (ifq) xa_erase(&ctx->zcrx_ctxs, id); -- cgit v1.2.3 From 0ec33c81d9c7342f03864101ddb2e717a0cce03e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 May 2025 18:07:33 +0100 Subject: io_uring/zcrx: fix area release on registration failure On area registration failure there might be no ifq set and it's not safe to access area->ifq in the release path without checking it first. Cc: stable@vger.kernel.org Fixes: f12ecf5e1c5ec ("io_uring/zcrx: fix late dma unmap for a dead dev") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bc02878678a5fec28bc77d33355cdba735418484.1748365640.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0c5b7d8f8d67..21c816c3bfe0 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -366,7 +366,8 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { - io_zcrx_unmap_area(area->ifq, area); + if (area->ifq) + io_zcrx_unmap_area(area->ifq, area); io_release_area_mem(&area->mem); kvfree(area->freelist); -- cgit v1.2.3 From 2c7f023219966777be0687e15b57689894304cd3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2025 13:45:44 -0600 Subject: io_uring/net: only consider msg_inq if larger than 1 Currently retry and general validity of msg_inq is gated on it being larger than zero, but it's entirely possible for this to be slightly inaccurate. In particular, if FIN is received, it'll return 1. Just use larger than 1 as the check. This covers both the FIN case, and at the same time, it doesn't make much sense to retry a recv immediately if there's even just a single 1 byte of valid data in the socket. Leave the SOCK_NONEMPTY flagging when larger than 0 still, as an app may use that for the final receive. Cc: stable@vger.kernel.org Reported-by: Christian Mazakas Fixes: 7c71a0af81ba ("io_uring/net: improve recv bundles") Signed-off-by: Jens Axboe --- io_uring/net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index d13f3e8f6c72..e16633fd6630 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -832,7 +832,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 && + if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; @@ -1070,7 +1070,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg arg.mode |= KBUF_MODE_FREE; } - if (kmsg->msg.msg_inq > 0) + if (kmsg->msg.msg_inq > 1) arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); ret = io_buffers_peek(req, &arg); -- cgit v1.2.3 From e931d3a9d5200bae9d938be2582072b2898e37f7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 May 2025 23:37:39 +0100 Subject: MAINTAINERS: remove myself from io_uring Disassociate my name from the project over disagreements on development practices. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/814ec73b73323a8e1c87643d193a73f467fb191f.1748034476.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e45559690b28..cd6ef0839df6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12492,7 +12492,6 @@ F: include/linux/iosys-map.h IO_URING M: Jens Axboe -M: Pavel Begunkov L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block -- cgit v1.2.3 From 607d09d1a01e9f29e91733e3a08b63ed240aacb2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Jun 2025 07:42:28 -0600 Subject: io_uring/kbuf: limit legacy provided buffer lists to USHRT_MAX The buffer ID for a provided buffer is an unsigned short, and hence there can only be 64k added to any given buffer list before having duplicate BIDs. Cap the legacy provided buffers at 64k in the list. This is mostly to prevent silly stall reports from syzbot, which likes to dump tons of buffers into a list and then have kernels with lockdep and kasan churning through them and hitting long wait times for buffer pruning at ring exit time. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 17 +++++++++++++++-- io_uring/kbuf.h | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8cce3ebd813f..2ea65f3cef72 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -108,6 +108,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); + bl->nbufs++; req->flags &= ~REQ_F_BUFFER_SELECTED; io_ring_submit_unlock(ctx, issue_flags); @@ -122,6 +123,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); + bl->nbufs--; if (*len == 0 || *len > kbuf->len) *len = kbuf->len; if (list_empty(&bl->buf_list)) @@ -390,6 +392,7 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); + bl->nbufs--; kfree(nxt); cond_resched(); } @@ -491,14 +494,24 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, { struct io_buffer *buf; u64 addr = pbuf->addr; - int i, bid = pbuf->bid; + int ret = -ENOMEM, i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { + /* + * Nonsensical to have more than sizeof(bid) buffers in a + * buffer list, as the application then has no way of knowing + * which duplicate bid refers to what buffer. + */ + if (bl->nbufs == USHRT_MAX) { + ret = -EOVERFLOW; + break; + } buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) break; list_add_tail(&buf->list, &bl->buf_list); + bl->nbufs++; buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -508,7 +521,7 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, cond_resched(); } - return i ? 0 : -ENOMEM; + return i ? 0 : ret; } static int __io_manage_buffers_legacy(struct io_kiocb *req, diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 4d2c209d1a41..5d83c7adc739 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -21,6 +21,9 @@ struct io_buffer_list { struct list_head buf_list; struct io_uring_buf_ring *buf_ring; }; + /* count of classic/legacy buffers in buffer list */ + int nbufs; + __u16 bgid; /* below is for ring provided buffers */ -- cgit v1.2.3 From 6a8118a77eec5fc4dfec69cc6bdc52229943f6ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jun 2025 08:49:32 -0600 Subject: io_uring/futex: get rid of struct io_futex addr union Rather than use a union of a u32 and struct futex_waitv user address, consolidate it into a single void __user pointer instead. This also makes prep easier to use as the store happens to the member that will be used. No functional changes in this patch. Signed-off-by: Jens Axboe --- io_uring/futex.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index fa374afbaa51..5a3991b0d1a7 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -14,10 +14,7 @@ struct io_futex { struct file *file; - union { - u32 __user *uaddr; - struct futex_waitv __user *uwaitv; - }; + void __user *uaddr; unsigned long futex_val; unsigned long futex_mask; unsigned long futexv_owned; @@ -186,7 +183,7 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!futexv) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, + ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { kfree(futexv); -- cgit v1.2.3 From 079afb081c4288e94d5e4223d3eb6306d853c68b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Jun 2025 10:25:42 -0600 Subject: io_uring/futex: mark wait requests as inflight Inflight marking is used so that do_exit() -> io_uring_files_cancel() will find requests with files that reference an io_uring instance, so they can get appropriately canceled before the files go away. However, it's also called before the mm goes away. Mark futex/futexv wait requests as being inflight, so that io_uring_files_cancel() will prune them. This ensures that the mm stays alive, which is important as an exiting mm will also free the futex private hash buckets. An io_uring futex request with FUTEX2_PRIVATE set relies on those being alive until the request has completed. A recent commit added these futex private hashes, which get killed when the mm goes away. Fixes: 80367ad01d93 ("futex: Add basic infrastructure for local task local hash") Link: https://lore.kernel.org/io-uring/38053.1749045482@localhost/ Reported-by: Robert Morris Signed-off-by: Jens Axboe --- io_uring/futex.c | 4 ++++ io_uring/io_uring.c | 7 ++++++- io_uring/io_uring.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 5a3991b0d1a7..692462d50c8c 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -145,6 +145,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); return 0; } @@ -190,6 +192,8 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c7a9cecf528e..cf759c172083 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -408,7 +408,12 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. + */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0ea7a435d1de..d59c12277d58 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -83,6 +83,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -- cgit v1.2.3