-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl    3
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl    3
-rw-r--r--  fs/Makefile                               3
-rw-r--r--  fs/file_table.c                           2
-rw-r--r--  fs/ringbuffer.c                         409
-rw-r--r--  include/linux/fs.h                       12
6 files changed, 431 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7fd1f57ad3d3..2385359eaf75 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -467,3 +467,6 @@
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
462 i386 mseal sys_mseal
+463 i386 ringbuffer sys_ringbuffer
+464 i386 ringbuffer_wait sys_ringbuffer_wait
+465 i386 ringbuffer_wakeup sys_ringbuffer_wakeup
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index a396f6e6ab5b..942602ece075 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -384,6 +384,9 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common ringbuffer sys_ringbuffer
+464 common ringbuffer_wait sys_ringbuffer_wait
+465 common ringbuffer_wakeup sys_ringbuffer_wakeup
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/Makefile b/fs/Makefile
index 6ecc9b0a53f2..9b9e990cf64c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,8 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+ ringbuffer.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
diff --git a/fs/file_table.c b/fs/file_table.c
index 4f03beed4737..56e29249fb57 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
+#include <linux/ringbuffer_sys.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>
@@ -412,6 +413,7 @@ static void __fput(struct file *file)
*/
eventpoll_release(file);
locks_remove_file(file);
+ ringbuffer_release(file);
security_file_release(file);
if (unlikely(file->f_flags & FASYNC)) {
diff --git a/fs/ringbuffer.c b/fs/ringbuffer.c
new file mode 100644
index 000000000000..7325a117befd
--- /dev/null
+++ b/fs/ringbuffer.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/pseudo_fs.h>
+#include <linux/ringbuffer_sys.h>
+#include <linux/syscalls.h>
+
+#define RINGBUFFER_FS_MAGIC 0xa10a10a2
+#define RINGBUFFER_INITIALIZING ((void *) 1)
+
+static struct vfsmount *ringbuffer_mnt;
+
+/*
+ * ringbuffer_ptrs - head and tail pointers for a ringbuffer, mapped to
+ * userspace:
+ */
+struct ringbuffer_ptrs {
+ /*
+ * We use u32s because this type is shared between the kernel and
+ * userspace - ulong/size_t won't work here, since we might have a 32 bit
+ * userland and a 64 bit kernel. u64 would be preferable (reduced
+ * probability of ABA), but not all architectures can atomically
+ * read/write a u64, and we need to avoid torn reads/writes.
+ *
+ * head and tail pointers are incremented and stored without masking;
+ * this is to avoid ABA and differentiate between a full and empty
+ * buffer - they must be masked with @mask to get an actual offset into
+ * the data buffer.
+ *
+ * All units are in bytes.
+ *
+ * Data is emitted at head, consumed from tail.
+ */
+ u32 head;
+ u32 tail;
+ u32 size; /* always a power of two */
+ u32 mask; /* size - 1 */
+
+ /*
+ * Starting offset of data buffer, from the start of this struct - will
+ * always be PAGE_SIZE.
+ */
+ u32 data_offset;
+};
+
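The struct above, together with the data buffer that follows it at data_offset, is the whole userspace-visible ABI: ringbuffer(2) below maps one page of ringbuffer_ptrs plus the power-of-two data buffer into the caller. The patch does not include a userspace consumer, so here is a minimal sketch of one for a READ-direction ring, assuming GCC/Clang __atomic builtins and a single consumer thread; rb_consume() and min_u32() are illustrative names, not part of the proposed ABI.

/* Userspace sketch - not part of this patch: consume from a READ-direction ring. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct ringbuffer_ptrs {
        uint32_t head, tail, size, mask, data_offset;
};

static uint32_t min_u32(uint32_t a, uint32_t b)
{
        return a < b ? a : b;
}

/* @base is the address returned by ringbuffer(2); copies up to @len bytes into @buf. */
static size_t rb_consume(void *base, void *buf, size_t len)
{
        struct ringbuffer_ptrs *p = base;
        unsigned char *data = (unsigned char *) base + p->data_offset;
        /* acquire pairs with the kernel's smp_store_release() on head: */
        uint32_t head = __atomic_load_n(&p->head, __ATOMIC_ACQUIRE);
        uint32_t tail = p->tail;
        size_t ret = 0;

        while (len && tail != head) {
                uint32_t b = min_u32(head - tail, p->size - (tail & p->mask));

                if (b > len)
                        b = len;
                memcpy((unsigned char *) buf + ret, data + (tail & p->mask), b);
                tail += b;
                len  -= b;
                ret  += b;
        }

        /* release: the kernel may reuse this space once it observes the new tail */
        __atomic_store_n(&p->tail, tail, __ATOMIC_RELEASE);
        return ret;
}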
+struct ringbuffer {
+ wait_queue_head_t wait[2];
+ spinlock_t lock;
+ int rw;
+ u32 size; /* always a power of two */
+ u32 mask; /* size - 1 */
+ struct file *io_file;
+ /* hidden internal file for the mmap */
+ struct file *rb_file;
+ struct ringbuffer_ptrs *ptrs;
+ void *data;
+ ulong user_addr;
+};
+
+static const struct address_space_operations ringbuffer_aops = {
+ .dirty_folio = noop_dirty_folio,
+#if 0
+ .migrate_folio = ringbuffer_migrate_folio,
+#endif
+};
+
+#if 0
+static int ringbuffer_mremap(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
+ struct kioctx_table *table;
+ int i, res = -EINVAL;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+ if (!table)
+ goto out_unlock;
+
+ for (i = 0; i < table->nr; i++) {
+ struct kioctx *ctx;
+
+ ctx = rcu_dereference(table->table[i]);
+ if (ctx && ctx->ringbuffer_file == file) {
+ if (!atomic_read(&ctx->dead)) {
+ ctx->user_id = ctx->mmap_base = vma->vm_start;
+ res = 0;
+ }
+ break;
+ }
+ }
+
+out_unlock:
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+ return res;
+}
+#endif
+
+static const struct vm_operations_struct ringbuffer_vm_ops = {
+#if 0
+ .mremap = ringbuffer_mremap,
+#endif
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int ringbuffer_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &ringbuffer_vm_ops;
+ return 0;
+}
+
+static const struct file_operations ringbuffer_fops = {
+ .mmap = ringbuffer_mmap,
+};
+
+static struct ringbuffer *ringbuffer_alloc(struct file *file, int rw, u32 size)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned order = get_order(size);
+ size = PAGE_SIZE << order;
+
+ struct inode *inode = NULL;
+ int ret = -ENOMEM;
+ struct ringbuffer *rb = kzalloc(sizeof(*rb), GFP_KERNEL);
+ if (!rb)
+ return ERR_PTR(-ENOMEM);
+
+ init_waitqueue_head(&rb->wait[READ]);
+ init_waitqueue_head(&rb->wait[WRITE]);
+ spin_lock_init(&rb->lock);
+ rb->rw = rw;
+ rb->size = size;
+ rb->mask = size - 1;
+ rb->io_file = file;
+
+ rb->ptrs = (void *) __get_free_page(GFP_KERNEL|__GFP_ZERO);
+ rb->data = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ if (!rb->ptrs || !rb->data)
+ goto err;
+
+ rb->ptrs->size = size;
+ rb->ptrs->mask = size - 1;
+ rb->ptrs->data_offset = PAGE_SIZE;
+
+ inode = alloc_anon_inode(ringbuffer_mnt->mnt_sb);
+ ret = PTR_ERR_OR_ZERO(inode);
+ if (ret)
+ goto err;
+
+ inode->i_mapping->a_ops = &ringbuffer_aops;
+ inode->i_mapping->i_private_data = rb;
+ inode->i_size = size;
+
+ rb->rb_file = alloc_file_pseudo(inode, ringbuffer_mnt, "[ringbuffer]",
+ O_RDWR, &ringbuffer_fops);
+ ret = PTR_ERR_OR_ZERO(rb->rb_file);
+ if (ret)
+ goto err;
+
+ ret = filemap_add_folio(rb->rb_file->f_mapping,
+ page_folio(virt_to_page(rb->ptrs)),
+ 0, GFP_KERNEL);
+ if (ret)
+ goto err;
+
+ ret = filemap_add_folio(rb->rb_file->f_mapping,
+ page_folio(virt_to_page(rb->data)),
+ 1, GFP_KERNEL);
+ if (ret)
+ goto err;
+
+ ret = mmap_write_lock_killable(mm);
+ if (ret)
+ goto err;
+
+ unsigned long unused;
+ rb->user_addr = do_mmap(rb->rb_file, 0, size + PAGE_SIZE,
+ PROT_READ|PROT_WRITE,
+ MAP_SHARED, 0, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ ret = PTR_ERR_OR_ZERO((void *) rb->user_addr);
+ if (ret)
+ goto err;
+
+ file->ringbuffer[rw] = rb;
+ return rb;
+err:
+ if (inode && !IS_ERR(inode))
+ iput(inode);
+ free_pages((ulong) rb->data, order);
+ free_page((ulong) rb->ptrs);
+ kfree(rb);
+ return ERR_PTR(ret);
+}
+
+/* file is going away, tear down ringbuffers: */
+void ringbuffer_release(struct file *file)
+{
+}
+
+SYSCALL_DEFINE4(ringbuffer, unsigned, fd, int, rw, u32, size, ulong __user *, ringbufferp)
+{
+ unsigned long ringbuffer;
+
+ int ret = get_user(ringbuffer, ringbufferp);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(ringbuffer || !size || rw > WRITE))
+ return -EINVAL;
+
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ if (!(f.file->f_op->fop_flags & (rw == READ ? FOP_RINGBUFFER_READ : FOP_RINGBUFFER_WRITE))) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
+
+ /* ringbuffer ptr entry serves as a lock while it's being initialized */
+ if (f.file->ringbuffer[rw] ||
+ cmpxchg(&f.file->ringbuffer[rw], NULL, RINGBUFFER_INITIALIZING)) {
+ ret = -EEXIST;
+ goto err;
+ }
+
+ struct ringbuffer *rb = ringbuffer_alloc(f.file, rw, size);
+ ret = PTR_ERR_OR_ZERO(rb);
+ if (ret)
+ goto err_uninit;
+
+ ret = put_user(rb->user_addr, ringbufferp);
+ if (ret) {
+ BUG();
+ goto err_uninit;
+ }
+err:
+ fdput(f);
+ return ret;
+err_uninit:
+ f.file->ringbuffer[rw] = NULL;
+ goto err;
+}
+
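There is no glibc wrapper for a brand-new syscall, so userspace has to go through syscall(2). A setup sketch, using the x86 numbers added by the syscall table hunks above; rb_map() and the RB_* constants are illustrative names, and note that the value at ringbufferp must be zero on entry or the call fails with -EINVAL.

/* Userspace sketch - not part of this patch: attach a ring to an existing fd. */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_ringbuffer
#define __NR_ringbuffer         463     /* x86 numbers added by this patch */
#define __NR_ringbuffer_wait    464
#define __NR_ringbuffer_wakeup  465
#endif

#define RB_READ         0               /* matches the kernel's READ */
#define RB_WRITE        1               /* matches the kernel's WRITE */

/* Returns the mapping (struct ringbuffer_ptrs, then data), or NULL on error. */
static void *rb_map(int fd, int rw, unsigned int size)
{
        unsigned long addr = 0;         /* must be zero going in */

        if (syscall(__NR_ringbuffer, fd, rw, size, &addr)) {
                perror("ringbuffer");
                return NULL;
        }
        return (void *) addr;
}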
+static bool __ringbuffer_read(struct ringbuffer *rb, void **data, size_t *len,
+ bool nonblocking, size_t *ret)
+{
+ u32 head = smp_load_acquire(&rb->ptrs->head);
+ u32 tail = rb->ptrs->tail;
+
+ if (head == tail)
+ return false;
+
+ ulong flags;
+ spin_lock_irqsave(&rb->lock, flags);
+ /* Multiple consumers - recheck under lock: */
+ tail = rb->ptrs->tail;
+
+ while (*len && tail != head) {
+ u32 tail_masked = tail & rb->mask;
+ u32 b = min(*len,
+ min(head - tail,
+ rb->size - tail_masked));
+
+ memcpy(*data, rb->data + tail_masked, b);
+ tail += b;
+ *data += b;
+ *len -= b;
+ *ret += b;
+ }
+
+ smp_store_release(&rb->ptrs->tail, tail);
+ spin_unlock_irqrestore(&rb->lock, flags);
+
+ return !*len || nonblocking;
+}
+
+size_t ringbuffer_read(struct ringbuffer *rb, void *data, size_t len, bool nonblocking)
+{
+ size_t ret = 0;
+ wait_event(rb->wait[READ], __ringbuffer_read(rb, &data, &len, nonblocking, &ret));
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ringbuffer_read);
+
+static bool __ringbuffer_write(struct ringbuffer *rb, void **data, size_t *len,
+ bool nonblocking, size_t *ret)
+{
+ u32 head = rb->ptrs->head;
+ u32 tail = smp_load_acquire(&rb->ptrs->tail);
+
+ if (head - tail >= rb->size)
+ return false;
+
+ ulong flags;
+ spin_lock_irqsave(&rb->lock, flags);
+ /* Multiple producers - recheck under lock: */
+ head = rb->ptrs->head;
+
+ while (*len && head - tail < rb->size) {
+ u32 head_masked = head & rb->mask;
+ u32 b = min(*len,
+ min(tail + rb->size - head,
+ rb->size - head_masked));
+
+ memcpy(rb->data + head_masked, *data, b);
+ head += b;
+ *data += b;
+ *len -= b;
+ *ret += b;
+ }
+
+ smp_store_release(&rb->ptrs->head, head);
+ spin_unlock_irqrestore(&rb->lock, flags);
+
+ return !*len || nonblocking;
+}
+
+size_t ringbuffer_write(struct ringbuffer *rb, void *data, size_t len, bool nonblocking)
+{
+ size_t ret = 0;
+ wait_event(rb->wait[WRITE], __ringbuffer_write(rb, &data, &len, nonblocking, &ret));
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ringbuffer_write);
+
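ringbuffer_read() and ringbuffer_write() are the kernel-side halves of the two rings, but this patch carries no in-tree user and does not show linux/ringbuffer_sys.h, so the wiring below is only a guess at the intended usage: my_dev_fops and my_dev_emit() are hypothetical, and it assumes the header declares ringbuffer_write(). Note also that ringbuffer_write() as written does not wake tasks sleeping in ringbuffer_wait(2), so the producer-to-consumer notification path is not shown here.

/* Driver sketch - not part of this patch: feeding a READ-direction ring. */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/ringbuffer_sys.h>

static const struct file_operations my_dev_fops = {
        .owner          = THIS_MODULE,
        /* advertise support so ringbuffer(2) accepts fds backed by this driver: */
        .fop_flags      = FOP_RINGBUFFER_READ,
        /* .open, .read, .release, ... */
};

/* Called by the (hypothetical) driver when @len bytes are ready for userspace. */
static void my_dev_emit(struct file *file, const void *buf, size_t len)
{
        struct ringbuffer *rb = file->ringbuffer[READ];

        /*
         * The transient RINGBUFFER_INITIALIZING value is private to
         * fs/ringbuffer.c, so a real caller would need an accessor that
         * filters it out; this sketch only checks for NULL.
         */
        if (!rb)
                return;

        /* nonblocking: drop data rather than sleep in an unknown context */
        ringbuffer_write(rb, (void *) buf, len, true);
}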
+SYSCALL_DEFINE2(ringbuffer_wait, unsigned, fd, int, rw)
+{
+ int ret = 0;
+
+ if (rw > WRITE)
+ return -EINVAL;
+
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ struct ringbuffer *rb = f.file->ringbuffer[rw];
+ if (!rb || rb == RINGBUFFER_INITIALIZING) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ struct ringbuffer_ptrs *rp = rb->ptrs;
+ wait_event(rb->wait[rw], rw == READ
+ ? rp->head != rp->tail
+ : rp->head - rp->tail < rb->size);
+err:
+ fdput(f);
+ return ret;
+}
+
+SYSCALL_DEFINE2(ringbuffer_wakeup, unsigned, fd, int, rw)
+{
+ int ret = 0;
+
+ if (rw > WRITE)
+ return -EINVAL;
+
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ struct ringbuffer *rb = f.file->ringbuffer[rw];
+ if (!rb || rb == RINGBUFFER_INITIALIZING) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ wake_up(&rb->wait[rw]);
+err:
+ fdput(f);
+ return ret;
+}
+
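On the userspace side the two calls slot into the consumer loop: poll the mapping, and fall back to ringbuffer_wait(2) when the ring runs dry. A sketch using the illustrative rb_map()/rb_consume() helpers from the earlier examples; note that ringbuffer_wakeup(2) wakes the same waitqueue that ringbuffer_wait(2) sleeps on for that fd and direction (useful for another thread sharing the ring), and nothing in this patch shows how a kernel-side producer wakes these waiters.

/* Userspace sketch - not part of this patch: blocking consumer loop. */
static void rb_consume_loop(int fd, void *base)
{
        unsigned char buf[4096];

        for (;;) {
                size_t n = rb_consume(base, buf, sizeof(buf));

                if (!n) {
                        /* block until rb->wait[READ] is woken and head != tail */
                        syscall(__NR_ringbuffer_wait, fd, RB_READ);
                        continue;
                }
                /* ... handle n bytes of payload ... */
        }
}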
+static int ringbuffer_init_fs_context(struct fs_context *fc)
+{
+ if (!init_pseudo(fc, RINGBUFFER_FS_MAGIC))
+ return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ return 0;
+}
+
+static int __init ringbuffer_setup(void)
+{
+ static struct file_system_type ringbuffer_fs = {
+ .name = "ringbuffer",
+ .init_fs_context = ringbuffer_init_fs_context,
+ .kill_sb = kill_anon_super,
+ };
+ ringbuffer_mnt = kern_mount(&ringbuffer_fs);
+ if (IS_ERR(ringbuffer_mnt))
+ panic("Failed to create ringbuffer fs mount.");
+ return 0;
+}
+__initcall(ringbuffer_setup);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0283cf366c2a..c05be43c1f65 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -978,6 +978,8 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
index < ra->start + ra->size);
}
+struct ringbuffer;
+
/*
* f_{lock,count,pos_lock} members can be highly contended and share
* the same cacheline. f_{lock,mode} are very frequently used together
@@ -1024,6 +1026,12 @@ struct file {
struct address_space *f_mapping;
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
+
+ /*
+ * Ringbuffers for reading/writing without syscall overhead, created by
+ * ringbuffer(2)
+ */
+ struct ringbuffer *ringbuffer[2];
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@@ -2051,6 +2059,10 @@ struct file_operations {
#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
+/* Supports read ringbuffers */
+#define FOP_RINGBUFFER_READ ((__force fop_flags_t)(1 << 5))
+/* Supports write ringbuffers */
+#define FOP_RINGBUFFER_WRITE ((__force fop_flags_t)(1 << 6))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,