author		Kent Overstreet <kent.overstreet@gmail.com>	2016-02-04 18:36:48 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-18 21:37:37 -0900
commit		3ae36b2b8294a034d1b0656ea2835ccdd4e6a797
tree		cbd6b585606a220c384900eafc84a57931e9c89a
parent		e3f87bfdb2b1dffc4c38f77d8490e764c2d65a06
bcachefs: split out fs-io.c
-rw-r--r--	drivers/md/bcache/Makefile	7
-rw-r--r--	drivers/md/bcache/fs-io.c	1834
-rw-r--r--	drivers/md/bcache/fs-io.h	77
-rw-r--r--	drivers/md/bcache/fs.c	1899
-rw-r--r--	drivers/md/bcache/fs.h	12
5 files changed, 1941 insertions(+), 1888 deletions(-)
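The bulk of the new fs-io.c is the page-state and i_size machinery moved out of fs.c. The key invariant, spelled out in the comments in the diff below, is that the on-disk i_size may only be advanced once every append write that would expose that size has completed, and completed appends are retired in FIFO order. What follows is a minimal userspace model of that ordering only; the names, the fixed FIFO size, and the single-threaded flush are assumptions for illustration and omit the locking, refcount-per-page, and inode-update plumbing of the real i_size_updates FIFO.

/*
 * Simplified model of deferred, FIFO-ordered i_size updates (illustrative
 * names; not the kernel API). Each in-flight append holds a reference on a
 * FIFO entry recording the i_size it will expose. Entries are flushed only
 * from the front of the FIFO, once their refcount drops to zero, so the
 * "on disk" i_size never runs ahead of data actually written.
 */
#include <assert.h>
#include <stdio.h>

#define NR_ENTRIES	4

struct i_size_update {
	long		count;		/* appends still outstanding */
	long long	new_i_size;	/* i_size to expose, -1 if cancelled */
};

static struct i_size_update	fifo[NR_ENTRIES];
static unsigned			fifo_front_idx, fifo_back_idx;
static long long		on_disk_i_size;

/* start an append that will extend the file to @new_i_size */
static unsigned update_new(long long new_i_size)
{
	unsigned idx = fifo_back_idx++ % NR_ENTRIES;

	fifo[idx] = (struct i_size_update) { .count = 1, .new_i_size = new_i_size };
	return idx;
}

/* append finished: drop our ref, then flush completed entries in order */
static void update_put(unsigned idx)
{
	struct i_size_update *u = &fifo[idx % NR_ENTRIES];

	assert(u->count > 0);
	u->count--;

	while (fifo_front_idx != fifo_back_idx &&
	       !fifo[fifo_front_idx % NR_ENTRIES].count) {
		u = &fifo[fifo_front_idx++ % NR_ENTRIES];

		if (u->new_i_size != -1 && u->new_i_size > on_disk_i_size)
			on_disk_i_size = u->new_i_size;	/* stands in for bch_write_inode_size() */
	}
}

int main(void)
{
	unsigned a = update_new(4096);	/* first append, extends file to 4096 */
	unsigned b = update_new(8192);	/* second append, extends file to 8192 */

	update_put(b);			/* the later append completes first... */
	printf("on disk i_size %lld\n", on_disk_i_size);	/* still 0 */

	update_put(a);			/* ...i_size only advances once the
					 * older append has also completed */
	printf("on disk i_size %lld\n", on_disk_i_size);	/* now 8192 */
	return 0;
}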
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index aa67393cd0a8..70119335e649 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ bset.o btree_cache.o btree_gc.o btree_io.o btree_iter.o btree_update.o\ buckets.o chardev.o clock.o closure.o debug.o dirent.o error.o\ - extents.o fs.o fs-gc.o inode.o io.o journal.o keybuf.o keylist.o\ - migrate.o move.o movinggc.o notify.o opts.o request.o siphash.o six.o\ - stats.o super.o sysfs.o tier.o trace.o util.o writeback.o xattr.o + extents.o fs.o fs-gc.o fs-io.o inode.o io.o journal.o keybuf.o\ + keylist.o migrate.o move.o movinggc.o notify.o opts.o request.o\ + siphash.o six.o stats.o super.o sysfs.o tier.o trace.o util.o\ + writeback.o xattr.o diff --git a/drivers/md/bcache/fs-io.c b/drivers/md/bcache/fs-io.c new file mode 100644 index 000000000000..36837376524a --- /dev/null +++ b/drivers/md/bcache/fs-io.c @@ -0,0 +1,1834 @@ + +#include "bcache.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" +#include "inode.h" +#include "journal.h" +#include "io.h" +#include "keylist.h" + +#include <linux/aio.h> +#include <linux/backing-dev.h> +#include <linux/falloc.h> +#include <linux/migrate.h> +#include <linux/mmu_context.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/uio.h> +#include <linux/writeback.h> + +struct bio_set *bch_writepage_bioset; +struct bio_set *bch_dio_read_bioset; +struct bio_set *bch_dio_write_bioset; + +/* i_size updates: */ + +/* + * In memory i_size should never be < on disk i_size: + */ +static void bch_i_size_write(struct inode *inode, loff_t new_i_size) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + + EBUG_ON(new_i_size < ei->i_size); + i_size_write(inode, new_i_size); +} + +static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi, + void *p) +{ + loff_t *new_i_size = p; + unsigned i_flags = le32_to_cpu(bi->i_flags); + + lockdep_assert_held(&ei->update_lock); + + bi->i_size = cpu_to_le64(*new_i_size); + + if (atomic_long_read(&ei->i_size_dirty_count)) + i_flags |= BCH_INODE_I_SIZE_DIRTY; + else + i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + + bi->i_flags = cpu_to_le32(i_flags);; + + return 0; +} + +static int __must_check bch_write_inode_size(struct cache_set *c, + struct bch_inode_info *ei, + loff_t new_size) +{ + return __bch_write_inode(c, ei, inode_set_size, &new_size); +} + +static int inode_set_dirty(struct bch_inode_info *ei, + struct bch_inode *bi, void *p) +{ + bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)| + BCH_INODE_I_SIZE_DIRTY); + return 0; +} + +static int check_make_i_size_dirty(struct bch_inode_info *ei, loff_t offset) +{ + bool need_set_dirty; + unsigned seq; + int ret = 0; + + do { + seq = read_seqcount_begin(&ei->shadow_i_size_lock); + need_set_dirty = offset > ei->i_size && + !(ei->i_flags & BCH_INODE_I_SIZE_DIRTY); + } while (read_seqcount_retry(&ei->shadow_i_size_lock, seq)); + + if (!need_set_dirty) + return 0; + + mutex_lock(&ei->update_lock); + + /* recheck under lock.. 
*/ + + if (offset > ei->i_size && + !(ei->i_flags & BCH_INODE_I_SIZE_DIRTY)) { + struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info; + + ret = __bch_write_inode(c, ei, inode_set_dirty, NULL); + } + + mutex_unlock(&ei->update_lock); + + return ret; +} + +static inline void i_size_dirty_put(struct bch_inode_info *ei) +{ + atomic_long_dec_bug(&ei->i_size_dirty_count); +} + +static inline void i_size_dirty_get(struct bch_inode_info *ei) +{ + lockdep_assert_held(&ei->vfs_inode.i_rwsem); + + atomic_long_inc(&ei->i_size_dirty_count); +} + +static void i_size_update_put(struct cache_set *c, struct bch_inode_info *ei, + unsigned idx, unsigned long count) +{ + struct i_size_update *u = &ei->i_size_updates.data[idx]; + loff_t new_i_size = -1; + long r; + + if (!count) + return; + + r = atomic_long_sub_return(count, &u->count); + BUG_ON(r < 0); + + if (r) + return; + + /* + * Flush i_size_updates entries in order - from the end of the fifo - + * if the entry at the end is finished (refcount has gone to 0): + */ + + mutex_lock(&ei->update_lock); + + while (!fifo_empty(&ei->i_size_updates) && + !atomic_long_read(&(u = &fifo_front(&ei->i_size_updates))->count)) { + struct i_size_update t; + + i_size_dirty_put(ei); + + if (u->new_i_size != -1) { + BUG_ON(u->new_i_size < ei->i_size); + new_i_size = u->new_i_size; + } + + fifo_pop(&ei->i_size_updates, t); + } + + if (new_i_size != -1) { + int ret = bch_write_inode_size(c, ei, new_i_size); + + ret = ret; + /* + * XXX: need to pin the inode in memory if the inode update + * fails + */ + } + + mutex_unlock(&ei->update_lock); +} + +static struct i_size_update *i_size_update_new(struct bch_inode_info *ei, + loff_t new_size) +{ + struct i_size_update *u; + + lockdep_assert_held(&ei->update_lock); + + if (fifo_empty(&ei->i_size_updates) || + (test_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags) && + !fifo_full(&ei->i_size_updates))) { + clear_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); + fifo_push(&ei->i_size_updates, + (struct i_size_update) { 0 }); + + u = &fifo_back(&ei->i_size_updates); + atomic_long_set(&u->count, 0); + i_size_dirty_get(ei); + } + + u = &fifo_back(&ei->i_size_updates); + u->new_i_size = new_size; + + return u; +} + +/* page state: */ + +/* stored in page->private: */ +struct bch_page_state { + u8 idx; +}; + +#define SECTORS_CACHE 1024 + +static int reserve_sectors(struct cache_set *c, unsigned sectors) +{ + u64 sectors_to_get = SECTORS_CACHE + sectors; + + if (likely(atomic64_sub_return(sectors, + &c->sectors_reserved_cache) >= 0)) + return 0; + + atomic64_add(sectors_to_get, &c->sectors_reserved); + + if (likely(!cache_set_full(c))) { + atomic64_add(sectors_to_get, &c->sectors_reserved_cache); + return 0; + } + + atomic64_sub_bug(sectors_to_get, &c->sectors_reserved); + atomic64_add(sectors, &c->sectors_reserved_cache); + return -ENOSPC; +} + +/* + * our page flags: + * + * allocated - page has space on disk reserved for it (c->sectors_reserved) - + * -ENOSPC was checked then, shouldn't be checked later + * + * append - page is dirty from an append write, new i_size can't be written + * until after page is written; ref held on ei->i_size_dirty_count + */ + +#define PF_ANY(page, enforce) page +PAGEFLAG(Allocated, private, PF_ANY) +TESTSCFLAG(Allocated, private, PF_ANY) + +PAGEFLAG(Append, private_2, PF_ANY) +TESTSCFLAG(Append, private_2, PF_ANY) +#undef PF_ANY + +static void bch_clear_page_bits(struct cache_set *c, struct bch_inode_info *ei, + struct page *page) +{ + EBUG_ON(!PageLocked(page)); + + if (PageAllocated(page)) { + 
atomic64_sub_bug(PAGE_SECTORS, &c->sectors_reserved); + ClearPageAllocated(page); + } + + if (PageAppend(page)) { + struct bch_page_state *s = (void *) &page->private; + + i_size_update_put(c, ei, s->idx, 1); + ClearPageAppend(page); + } +} + +/* readpages/writepages: */ + +static int bch_bio_add_page(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + + BUG_ON(!bio->bi_max_vecs); + + if (!bio->bi_vcnt) + bio->bi_iter.bi_sector = offset; + else if (bio_end_sector(bio) != offset || + bio->bi_vcnt == bio->bi_max_vecs) + return -1; + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + + bio->bi_iter.bi_size += PAGE_SIZE; + + return 0; +} + +static void bch_readpages_end_io(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!bio->bi_error) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +static inline struct page *__readpage_next_page(struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct page *page; + int ret; + + while (*nr_pages) { + page = list_entry(pages->prev, struct page, lru); + prefetchw(&page->flags); + list_del(&page->lru); + + ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS); + + /* if add_to_page_cache_lru() succeeded, page is locked: */ + put_page(page); + + if (!ret) + return page; + + (*nr_pages)--; + } + + return NULL; +} + +#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \ + for (; \ + ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\ + (_nr_pages)--) + +int bch_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio = NULL; + struct page *page; + + pr_debug("reading %u pages", nr_pages); + + for_each_readpage_page(mapping, pages, nr_pages, page) { +again: + if (!bio) { + bio = bio_alloc(GFP_NOFS, + min_t(unsigned, nr_pages, + BIO_MAX_PAGES)); + + bio->bi_end_io = bch_readpages_end_io; + } + + if (bch_bio_add_page(bio, page)) { + bch_read(c, bio, inode->i_ino); + bio = NULL; + goto again; + } + } + + if (bio) + bch_read(c, bio, inode->i_ino); + + pr_debug("success"); + return 0; +} + +int bch_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio; + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_end_io = bch_readpages_end_io; + + bch_bio_add_page(bio, page); + bch_read(c, bio, inode->i_ino); + + return 0; +} + +struct bch_writepage { + struct cache_set *c; + u64 inum; + struct bch_writepage_io *io; +}; + +static void bch_writepage_io_free(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct bio *bio = &io->bio.bio.bio; + + bio_put(bio); +} + +static void bch_writepage_io_done(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct cache_set *c = io->op.c; + struct bio *bio = &io->bio.bio.bio; + struct bch_inode_info *ei = io->ei; + struct bio_vec *bvec; + unsigned i; + + atomic64_sub_bug(io->sectors_reserved, &c->sectors_reserved); + + 
for (i = 0; i < ARRAY_SIZE(io->i_size_update_count); i++) + i_size_update_put(c, ei, i, io->i_size_update_count[i]); + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + BUG_ON(!PageWriteback(page)); + + if (io->bio.bio.bio.bi_error) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + + end_page_writeback(page); + } + + closure_return_with_destructor(&io->cl, bch_writepage_io_free); +} + +static void bch_writepage_do_io(struct bch_writepage_io *io) +{ + pr_debug("writing %u sectors to %llu:%llu", + bio_sectors(&io->bio.bio.bio), + io->op.insert_key.k.p.inode, + (u64) io->bio.bio.bio.bi_iter.bi_sector); + + closure_call(&io->op.cl, bch_write, NULL, &io->cl); + continue_at(&io->cl, bch_writepage_io_done, io->op.c->wq); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch_writepage_io_alloc(struct bch_writepage *w, + struct bch_inode_info *ei, + struct page *page) +{ +alloc_io: + if (!w->io) { + struct bio *bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, + bch_writepage_bioset); + w->io = container_of(bio, struct bch_writepage_io, bio.bio.bio); + + closure_init(&w->io->cl, NULL); + w->io->ei = ei; + memset(w->io->i_size_update_count, 0, + sizeof(w->io->i_size_update_count)); + w->io->sectors_reserved = 0; + + bch_write_op_init(&w->io->op, w->c, &w->io->bio, NULL, + bkey_to_s_c(&KEY(w->inum, 0, 0)), + NULL, + &ei->journal_seq, 0); + } + + if (bch_bio_add_page(&w->io->bio.bio.bio, page)) { + bch_writepage_do_io(w->io); + w->io = NULL; + goto alloc_io; + } + + /* + * We shouldn't ever be handed pages for multiple inodes in a single + * pass - right? + */ + BUG_ON(ei != w->io->ei); +} + +static int __bch_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_writepage *w = data; + unsigned offset; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto do_io; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_SIZE - 1); + if (page->index > end_index || !offset) { + unlock_page(page); + return 0; + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + zero_user_segment(page, offset, PAGE_SIZE); +do_io: + if (check_make_i_size_dirty(ei, page_offset(page) + PAGE_SIZE)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + bch_writepage_io_alloc(w, ei, page); + + /* + * Before unlocking the page, transfer refcounts to w->io: + */ + if (PageAppend(page)) { + struct bch_page_state *s = (void *) &page->private; + + /* + * i_size won't get updated and this write's data made visible + * until the i_size_update this page points to completes - so + * tell the write path to start a new one: + */ + if (&ei->i_size_updates.data[s->idx] == + &fifo_back(&ei->i_size_updates)) + set_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); + + w->io->i_size_update_count[s->idx]++; + ClearPageAppend(page); + } + + if (PageAllocated(page)) { + w->io->sectors_reserved += PAGE_SECTORS; + ClearPageAllocated(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + return 0; +} + +int bch_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + int ret; + struct bch_writepage w = { + .c = mapping->host->i_sb->s_fs_info, + .inum = mapping->host->i_ino, + .io = NULL, + }; + + ret = write_cache_pages(mapping, wbc, __bch_writepage, &w); + + if (w.io) + bch_writepage_do_io(w.io); + + return ret; +} + +int bch_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int ret; + struct bch_writepage w = { + .c = inode->i_sb->s_fs_info, + .inum = inode->i_ino, + .io = NULL, + }; + + ret = __bch_writepage(page, NULL, &w); + if (ret) + return ret; + + if (w.io) + bch_writepage_do_io(w.io); + + return 0; +} + +static void bch_read_single_page_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int bch_read_single_page(struct page *page, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio; + int ret = 0; + DECLARE_COMPLETION_ONSTACK(done); + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_private = &done; + bio->bi_end_io = bch_read_single_page_end_io; + bch_bio_add_page(bio, page); + + bch_read(c, bio, inode->i_ino); + wait_for_completion(&done); + + if (!ret) + ret = bio->bi_error; + bio_put(bio); + + if (ret < 0) + return ret; + + SetPageUptodate(page); + + return 0; +} + +int bch_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + struct page *page; + int ret = 0; + + BUG_ON(inode_unhashed(mapping->host)); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + if (!PageAllocated(page)) { + if (reserve_sectors(c, PAGE_SECTORS)) { + ret = -ENOSPC; + goto err; + } + + SetPageAllocated(page); + } + + if (PageUptodate(page)) + goto out; + + /* If we're writing entire page, don't need to read it in first: */ + if (len == PAGE_SIZE) + goto out; + + if (!offset && pos + len >= inode->i_size) { + zero_user_segment(page, len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } + + if (index > inode->i_size >> PAGE_SHIFT) { + zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); + flush_dcache_page(page); + goto out; + } + + ret = bch_read_single_page(page, mapping); + if (ret) + goto 
err; +out: + *pagep = page; + return ret; +err: + unlock_page(page); + put_page(page); + page = NULL; + goto out; +} + +int bch_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + lockdep_assert_held(&inode->i_rwsem); + + if (unlikely(copied < len && !PageUptodate(page))) { + /* + * The page needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + * + * userspace doesn't _have_ to redo the write, so clear + * PageAllocated: + */ + copied = 0; + zero_user(page, 0, PAGE_SIZE); + flush_dcache_page(page); + bch_clear_page_bits(c, ei, page); + goto out; + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) + set_page_dirty(page); + + if (pos + copied > inode->i_size) { + struct i_size_update *u; + + /* + * if page already has a ref on a i_size_update, even if it's an + * older one, leave it - they have to be flushed in order so + * that's just as good as taking a ref on a newer one, if we're + * adding a newer one now + * + * - if there's no current i_size_update, or if we want to + * create a new one and there's room for a new one, create it + * + * - set current i_size_update's i_size to new i_size + * + * - if !PageAppend, take a ref on the current i_size_update + */ + + /* XXX: locking */ + mutex_lock(&ei->update_lock); + u = i_size_update_new(ei, pos + copied); + + if (!PageAppend(page)) { + struct bch_page_state *s = (void *) &page->private; + + s->idx = u - ei->i_size_updates.data; + atomic_long_inc(&u->count); + + SetPageAppend(page); + } + + bch_i_size_write(inode, pos + copied); + mutex_unlock(&ei->update_lock); + } +out: + unlock_page(page); + put_page(page); + + return copied; +} + +/* O_DIRECT */ + +static void bch_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret, 0); + bio_put(&dio->bio); +} + +static void bch_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_error) + dio->ret = bio->bi_error; + + closure_put(&dio->cl); + bio_check_pages_dirty(bio); /* transfers ownership */ +} + +static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct dio_read *dio; + struct bio *bio; + unsigned long inum = inode->i_ino; + ssize_t ret = 0; + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + bool sync = is_sync_kiocb(req); + loff_t i_size; + + bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_read_bioset); + bio_get(bio); + + dio = container_of(bio, struct dio_read, bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = iter->count; + + i_size = i_size_read(inode); + if (offset + dio->ret > i_size) { + dio->ret = max_t(loff_t, 0, i_size - offset); + iter->count = round_up(dio->ret, PAGE_SIZE); + } + + if (!dio->ret) { + 
closure_put(&dio->cl); + goto out; + } + + goto start; + while (iter->count) { + pages = iov_iter_npages(iter, BIO_MAX_PAGES); + bio = bio_alloc(GFP_KERNEL, pages); +start: + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_end_io = bch_direct_IO_read_endio; + bio->bi_private = dio; + + ret = bio_get_user_pages(bio, iter, 1); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_error = ret; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch_read(c, bio, inum); + } +out: + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_put(&dio->bio); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +static void __bch_dio_write_complete(struct dio_write *dio) +{ + inode_dio_end(dio->req->ki_filp->f_inode); + + if (dio->iovec && dio->iovec != dio->inline_vecs) + kfree(dio->iovec); + + bio_put(&dio->bio.bio.bio); +} + +static void bch_dio_write_complete(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct kiocb *req = dio->req; + long ret = dio->written ?: dio->error; + + __bch_dio_write_complete(dio); + req->ki_complete(req, ret, 0); +} + +static void bch_dio_write_done(struct dio_write *dio) +{ + struct bio_vec *bv; + int i; + + dio->written += dio->iop.written << 9; + + if (dio->iop.error) + dio->error = dio->iop.error; + + bio_for_each_segment_all(bv, &dio->bio.bio.bio, i) + put_page(bv->bv_page); + + if (dio->iter.count) + bio_reset(&dio->bio.bio.bio); +} + +static void bch_do_direct_IO_write(struct dio_write *dio, bool sync) +{ + struct file *file = dio->req->ki_filp; + struct inode *inode = file->f_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio = &dio->bio.bio.bio; + unsigned flags = BCH_WRITE_CHECK_ENOSPC; + int ret; + + if (file->f_flags & O_DSYNC || IS_SYNC(file->f_mapping->host)) + flags |= BCH_WRITE_FLUSH; + + while (dio->iter.count) { + bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; + + ret = bio_get_user_pages(bio, &dio->iter, 0); + if (ret < 0) { + dio->error = ret; + break; + } + + bch_write_op_init(&dio->iop, c, &dio->bio, NULL, + bkey_to_s_c(&KEY(inode->i_ino, + bio_end_sector(bio), + bio_sectors(bio))), + NULL, + &ei->journal_seq, flags); + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&dio->iop.cl, bch_write, NULL, &dio->cl); + + if (!sync) + break; + + closure_sync(&dio->cl); + bch_dio_write_done(dio); + } +} + +static void bch_dio_write_loop_async(struct closure *cl) +{ + struct dio_write *dio = + container_of(cl, struct dio_write, cl); + + bch_dio_write_done(dio); + + if (dio->iter.count && !dio->error) { + use_mm(dio->mm); + bch_do_direct_IO_write(dio, false); + unuse_mm(dio->mm); + + continue_at(&dio->cl, + bch_dio_write_loop_async, + dio->iter.count ? 
system_wq : NULL); + } else { +#if 0 + closure_return_with_destructor(cl, bch_dio_write_complete); +#else + closure_debug_destroy(cl); + bch_dio_write_complete(cl); +#endif + } +} + +static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct dio_write *dio; + struct bio *bio; + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + ssize_t ret; + bool sync; + + lockdep_assert_held(&inode->i_rwsem); + + bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_write_bioset); + + dio = container_of(bio, struct dio_write, bio.bio.bio); + dio->req = req; + dio->written = 0; + dio->error = 0; + dio->offset = offset; + dio->append = false; + dio->iovec = NULL; + dio->iter = *iter; + dio->mm = current->mm; + + if (offset + iter->count > inode->i_size) { + /* + * XXX: try and convert this to i_size_update_new(), and maybe + * make async O_DIRECT appends work + */ + + dio->append = true; + i_size_dirty_get(ei); + } + + ret = check_make_i_size_dirty(ei, offset + iter->count); + if (ret) { + if (dio->append) + i_size_dirty_put(ei); + bio_put(bio); + return ret; + } + + closure_init(&dio->cl, NULL); + + inode_dio_begin(inode); + + /* + * appends are sync in order to do the i_size update under + * i_rwsem, after we know the write has completed successfully + */ + sync = is_sync_kiocb(req) || dio->append; + + bch_do_direct_IO_write(dio, sync); + + if (sync) { + closure_debug_destroy(&dio->cl); + ret = dio->written ?: dio->error; + + if (dio->append) { + loff_t new_i_size = offset + dio->written; + int ret2 = 0; + + if (dio->written && + new_i_size > inode->i_size) { + struct i_size_update *u; + unsigned idx; + + mutex_lock(&ei->update_lock); + + bch_i_size_write(inode, new_i_size); + + fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) { + if (u->new_i_size < new_i_size) + u->new_i_size = -1; + else + BUG(); + } + + i_size_dirty_put(ei); + ret2 = bch_write_inode_size(c, ei, new_i_size); + + mutex_unlock(&ei->update_lock); + } else { + i_size_dirty_put(ei); + } + } + + __bch_dio_write_complete(dio); + return ret; + } else { + if (dio->iter.count) { + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + dio->iovec = kmalloc(dio->iter.nr_segs * + sizeof(struct iovec), + GFP_KERNEL); + if (!dio->iovec) + dio->error = -ENOMEM; + } else { + dio->iovec = dio->inline_vecs; + } + + memcpy(dio->iovec, + dio->iter.iov, + dio->iter.nr_segs * sizeof(struct iovec)); + dio->iter.iov = dio->iovec; + } + + continue_at_noreturn(&dio->cl, + bch_dio_write_loop_async, + dio->iter.count ? system_wq : NULL); + return -EIOCBQUEUED; + } +} + +ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct inode *inode = file->f_inode; + struct cache_set *c = inode->i_sb->s_fs_info; + + if ((req->ki_pos|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + return ((iov_iter_rw(iter) == WRITE) + ? 
bch_direct_IO_write + : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); +} + +static ssize_t +bch_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + loff_t pos = iocb->ki_pos; + ssize_t written; + size_t write_len; + pgoff_t end; + + write_len = iov_iter_count(from); + end = (pos + write_len - 1) >> PAGE_SHIFT; + + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + } + + written = mapping->a_ops->direct_IO(iocb, from); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + * + * Augh: this makes no sense for async writes - the second invalidate + * has to come after the new data is visible. But, we can't just move it + * to the end of the dio write path - for async writes we don't have + * i_mutex held anymore, + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } +out: + return written; +} + +static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + ret = file_remove_privs(file); + if (ret) + goto out; + + ret = file_update_time(file); + if (ret) + goto out; + + ret = iocb->ki_flags & IOCB_DIRECT + ? 
bch_direct_write(iocb, from) + : generic_perform_write(file, from, iocb->ki_pos); + + if (likely(ret > 0)) + iocb->ki_pos += ret; +out: + current->backing_dev_info = NULL; + return ret; +} + +ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __bch_write_iter(iocb, from); + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + struct cache_set *c = inode->i_sb->s_fs_info; + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + + /* + * i_mutex is required for synchronizing with fcollapse(), O_DIRECT + * writes + */ + inode_lock(inode); + + lock_page(page); + if (page->mapping != mapping || + page_offset(page) > i_size_read(inode)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (!PageAllocated(page)) { + if (reserve_sectors(c, PAGE_SECTORS)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + + SetPageAllocated(page); + } + + set_page_dirty(page); + wait_for_stable_page(page); +out: + inode_unlock(inode); + sb_end_pagefault(inode->i_sb); + return ret; +} + +void bch_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (offset || length < PAGE_SIZE) + return; + + bch_clear_page_bits(c, ei, page); +} + +int bch_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + bch_clear_page_bits(c, ei, page); + + if (PageDirty(page)) { + ClearPageDirty(page); + cancel_dirty_page(page); + } + + return 1; +} + +#ifdef CONFIG_MIGRATION +int bch_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (PageAllocated(page)) { + ClearPageAllocated(page); + SetPageAllocated(newpage); + } + + if (PageAppend(page)) { + ClearPageAppend(page); + SetPageAppend(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + +int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + inode_lock(inode); + if (datasync && end <= ei->i_size) + goto out; + + /* + * If there's still outstanding appends, we may have not yet written an + * i_size that exposes the data we just fsynced - however, we can + * advance the i_size on disk up to the end of what we just explicitly + * wrote: + */ + + mutex_lock(&ei->update_lock); + + if (end > ei->i_size && + ei->i_size < 
inode->i_size) { + struct i_size_update *u; + unsigned idx; + loff_t new_i_size = min_t(u64, inode->i_size, + roundup(end, PAGE_SIZE)); + + BUG_ON(fifo_empty(&ei->i_size_updates)); + BUG_ON(new_i_size < ei->i_size); + + /* + * There can still be a pending i_size update < the size we're + * writing, because it may have been shared with pages > the + * size we fsynced to: + */ + fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) + if (u->new_i_size < new_i_size) + u->new_i_size = -1; + + ret = bch_write_inode_size(c, ei, new_i_size); + } + + mutex_unlock(&ei->update_lock); +out: + inode_unlock(inode); + + if (ret) + return ret; + + if (c->opts.journal_flush_disabled) + return 0; + + return bch_journal_flush_seq(&c->journal, ei->journal_seq); +} + +static int __bch_truncate_page(struct address_space *mapping, + pgoff_t index, loff_t start, loff_t end) +{ + unsigned start_offset = start & (PAGE_SIZE - 1); + unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + struct page *page; + int ret = 0; + + /* Page boundary? Nothing to do */ + if (!((index == start >> PAGE_SHIFT && start_offset) || + (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) + return 0; + + page = find_lock_page(mapping, index); + if (!page) { + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + + /* + * XXX: we're doing two index lookups when we end up reading the + * page + */ + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, + index << (PAGE_SHIFT - 9))); + k = bch_btree_iter_peek(&iter); + bch_btree_iter_unlock(&iter); + + if (!k.k || + bkey_cmp(bkey_start_pos(k.k), + POS(inode->i_ino, + (index + 1) << (PAGE_SHIFT - 9))) >= 0) + return 0; + + page = find_or_create_page(mapping, + index, + GFP_KERNEL); + if (unlikely(!page)) { + ret = -ENOMEM; + goto out; + } + } + + if (!PageUptodate(page)) + if (bch_read_single_page(page, mapping)) { + ret = -EIO; + goto unlock; + } + + if (index == start >> PAGE_SHIFT && + index == end >> PAGE_SHIFT) + zero_user_segment(page, start_offset, end_offset); + else if (index == start >> PAGE_SHIFT) + zero_user_segment(page, start_offset, PAGE_SIZE); + else if (index == end >> PAGE_SHIFT) + zero_user_segment(page, 0, end_offset); + + set_page_dirty(page); +unlock: + unlock_page(page); + put_page(page); +out: + return ret; +} + +static int bch_truncate_page(struct address_space *mapping, loff_t from) +{ + return __bch_truncate_page(mapping, from >> PAGE_SHIFT, + from, from + PAGE_SIZE); +} + +int bch_truncate(struct inode *inode, struct iattr *iattr) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct i_size_update *u; + bool shrink = iattr->ia_size <= inode->i_size; + unsigned idx; + int ret = 0; + + inode_dio_wait(inode); + + mutex_lock(&ei->update_lock); + + /* + * The new i_size could be bigger or smaller than the current on + * disk size (ei->i_size): + * + * If it's smaller (i.e. we actually are truncating), then in + * order to make the truncate appear atomic we have to write out + * the new i_size before discarding the data to be truncated. + * + * However, if the new i_size is bigger than the on disk i_size, + * then we _don't_ want to write the new i_size here - because + * if there are appends in flight, that would cause us to expose + * the range between the old and the new i_size before those + * appends have completed. 
+ */ + + /* + * First, cancel i_size_updates that extend past the new + * i_size, so the i_size we write here doesn't get + * stomped on: + */ + fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) + if (u->new_i_size > iattr->ia_size) + u->new_i_size = -1; + + set_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); + u = i_size_update_new(ei, iattr->ia_size); + + atomic_long_inc(&u->count); + idx = u - ei->i_size_updates.data; + + if (iattr->ia_size < ei->i_size) + ret = bch_write_inode_size(c, ei, iattr->ia_size); + + mutex_unlock(&ei->update_lock); + + /* + * XXX: if we error, we leak i_size_dirty count - and we can't + * just put it, because it actually is still dirty + */ + if (unlikely(ret)) + return ret; + + /* + * truncate_setsize() does the i_size_write(), can't use + * bch_i_size_write() + */ + EBUG_ON(iattr->ia_size < ei->i_size); + truncate_setsize(inode, iattr->ia_size); + + /* + * There might be persistent reservations (from fallocate()) + * above i_size, which bch_inode_truncate() will discard - we're + * only supposed to discard them if we're doing a real truncate + * here (new i_size < current i_size): + */ + if (shrink) { + ret = bch_truncate_page(inode->i_mapping, iattr->ia_size); + if (unlikely(ret)) + return ret; + + ret = bch_inode_truncate(c, inode->i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + NULL, + &ei->journal_seq); + if (unlikely(ret)) + return ret; + } + + setattr_copy(inode, iattr); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + i_size_update_put(c, ei, idx, 1); + return 0; +} + +static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + u64 ino = inode->i_ino; + u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; + u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; + int ret = 0; + + inode_lock(inode); + ret = __bch_truncate_page(inode->i_mapping, + offset >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto out; + + if (offset >> PAGE_SHIFT != + (offset + len) >> PAGE_SHIFT) { + ret = __bch_truncate_page(inode->i_mapping, + (offset + len) >> PAGE_SHIFT, + offset, offset + len); + if (unlikely(ret)) + goto out; + } + + truncate_pagecache_range(inode, offset, offset + len - 1); + + if (discard_start < discard_end) + ret = bch_discard(c, + POS(ino, discard_start), + POS(ino, discard_end), + 0, NULL, &ei->journal_seq); +out: + inode_unlock(inode); + + return ret; +} + +static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter src; + struct btree_iter dst; + BKEY_PADDED(k) copy; + struct bkey_s_c k; + struct i_size_update *u; + loff_t new_size; + unsigned idx; + int ret; + + if ((offset | len) & (PAGE_SIZE - 1)) + return -EINVAL; + + bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, offset >> 9)); + /* position will be set from dst iter's position: */ + bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN); + bch_btree_iter_link(&src, &dst); + + /* + * We need i_mutex to keep the page cache consistent with the extents + * btree, and the btree consistent with i_size - we don't need outside + * locking for the extents btree itself, because we're using linked + * iterators + * + * XXX: hmm, need to prevent reads adding things to the pagecache until + * we're done? 
+ */ + inode_lock(inode); + + ret = -EINVAL; + if (offset + len >= inode->i_size) + goto err; + + if (inode->i_size < len) + goto err; + + new_size = inode->i_size - len; + + inode_dio_wait(inode); + + do { + ret = filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (ret) + goto err; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + offset >> PAGE_SHIFT, + ULONG_MAX); + } while (ret == -EBUSY); + + if (ret) + goto err; + + while (bkey_cmp(dst.pos, + POS(inode->i_ino, + round_up(new_size, PAGE_SIZE) >> 9)) < 0) { + bch_btree_iter_set_pos(&src, + POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + + /* Have to take intent locks before read locks: */ + ret = bch_btree_iter_traverse(&dst); + if (ret) + goto err_unwind; + + k = bch_btree_iter_peek_with_holes(&src); + if (!k.k) { + ret = -EIO; + goto err_unwind; + } + + bkey_reassemble(©.k, k); + + if (bkey_deleted(©.k.k)) + copy.k.k.type = KEY_TYPE_DISCARD; + + bch_cut_front(src.pos, ©.k); + copy.k.k.p.offset -= len >> 9; + + BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + + ret = bch_btree_insert_at(&dst, + &keylist_single(©.k), + NULL, &ei->journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); + if (ret < 0 && ret != -EINTR) + goto err_unwind; + + bch_btree_iter_unlock(&src); + } + + bch_btree_iter_unlock(&src); + bch_btree_iter_unlock(&dst); + + ret = bch_inode_truncate(c, inode->i_ino, + round_up(new_size, PAGE_SIZE) >> 9, + NULL, &ei->journal_seq); + if (ret) + goto err_unwind; + + mutex_lock(&ei->update_lock); + + /* + * Cancel i_size updates > new_size: + * + * Note: we're also cancelling i_size updates for appends < new_size, and + * writing the new i_size before they finish - would be better to use an + * i_size_update here like truncate, so we can sequence our i_size + * updates with outstanding appends and not have to cancel them: + */ + fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) + u->new_i_size = -1; + + ret = bch_write_inode_size(c, ei, new_size); + bch_i_size_write(inode, new_size); + + truncate_pagecache(inode, offset); + + mutex_unlock(&ei->update_lock); + + inode_unlock(inode); + + return ret; +err_unwind: + BUG(); +err: + bch_btree_iter_unlock(&src); + bch_btree_iter_unlock(&dst); + inode_unlock(inode); + return ret; +} + +static long bch_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_i reservation; + struct bkey_s_c k; + struct bpos end; + loff_t block_start, block_end; + loff_t new_size = offset + len; + unsigned sectors; + int ret; + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + inode_lock(inode); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > inode->i_size) { + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto err; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + /* just for __bch_truncate_page(): */ + inode_dio_wait(inode); + + ret = __bch_truncate_page(inode->i_mapping, + offset >> PAGE_SHIFT, + offset, offset + len); + + if (!ret && + offset >> PAGE_SHIFT != + (offset + len) >> PAGE_SHIFT) + ret = __bch_truncate_page(inode->i_mapping, + (offset + len) >> PAGE_SHIFT, + offset, offset + len); + + if (unlikely(ret)) + goto err; + + truncate_pagecache_range(inode, offset, offset + len - 1); + + block_start = round_up(offset, PAGE_SIZE); + block_end = round_down(offset + len, PAGE_SIZE); + } else { + block_start = round_down(offset, PAGE_SIZE); + block_end = round_up(offset + 
len, PAGE_SIZE); + } + + bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9)); + end = POS(inode->i_ino, block_end >> 9); + + while (bkey_cmp(iter.pos, end) < 0) { + unsigned flags = 0; + + k = bch_btree_iter_peek_with_holes(&iter); + if (!k.k) { + ret = bch_btree_iter_unlock(&iter) ?: -EIO; + goto err; + } + + if (bkey_extent_is_data(k.k)) { + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + bch_btree_iter_advance_pos(&iter); + continue; + } + + /* don't check for -ENOSPC if we're deleting data: */ + flags |= BTREE_INSERT_NOFAIL; + } + + bkey_init(&reservation.k); + reservation.k.type = BCH_RESERVATION; + reservation.k.p = k.k->p; + reservation.k.size = k.k->size; + + bch_cut_front(iter.pos, &reservation); + bch_cut_back(end, &reservation.k); + + sectors = reservation.k.size; + + ret = reserve_sectors(c, sectors); + if (ret) + goto err; + + ret = bch_btree_insert_at(&iter, + &keylist_single(&reservation), + NULL, &ei->journal_seq, + BTREE_INSERT_ATOMIC|flags); + + atomic64_sub_bug(sectors, &c->sectors_reserved); + + if (ret < 0 && ret != -EINTR) + goto err; + + } + bch_btree_iter_unlock(&iter); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > inode->i_size) { + struct i_size_update *u; + unsigned idx; + + mutex_lock(&ei->update_lock); + bch_i_size_write(inode, new_size); + + u = i_size_update_new(ei, new_size); + idx = u - ei->i_size_updates.data; + atomic_long_inc(&u->count); + mutex_unlock(&ei->update_lock); + + i_size_update_put(c, ei, idx, 1); + } + + inode_unlock(inode); + + return 0; +err: + bch_btree_iter_unlock(&iter); + inode_unlock(inode); + return ret; +} + +long bch_fallocate_dispatch(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + return bch_fallocate(inode, mode, offset, len); + + if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + return bch_fpunch(inode, offset, len); + + if (mode == FALLOC_FL_COLLAPSE_RANGE) + return bch_fcollapse(inode, offset, len); + + return -EOPNOTSUPP; +} diff --git a/drivers/md/bcache/fs-io.h b/drivers/md/bcache/fs-io.h new file mode 100644 index 000000000000..cb4574785ca1 --- /dev/null +++ b/drivers/md/bcache/fs-io.h @@ -0,0 +1,77 @@ +#ifndef _BCACHE_FS_IO_H +#define _BCACHE_FS_IO_H + +#include <linux/uio.h> + +int bch_writepage(struct page *, struct writeback_control *); +int bch_readpage(struct file *, struct page *); + +int bch_writepages(struct address_space *, struct writeback_control *); +int bch_readpages(struct file *, struct address_space *, + struct list_head *, unsigned); + +int bch_write_begin(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **); +int bch_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch_direct_IO(struct kiocb *, struct iov_iter *); + +ssize_t bch_write_iter(struct kiocb *, struct iov_iter *); + +int bch_fsync(struct file *, loff_t, loff_t, int); + +int bch_truncate(struct inode *, struct iattr *); +long bch_fallocate_dispatch(struct file *, int, loff_t, loff_t); + +int bch_page_mkwrite(struct vm_area_struct *, struct vm_fault *); +void bch_invalidatepage(struct page *, unsigned int, unsigned int); +int bch_releasepage(struct page *, gfp_t); +int bch_migrate_page(struct address_space *, struct page *, + struct page *, enum migrate_mode); + +struct bch_writepage_io { + struct closure cl; + + struct bch_inode_info *ei; + unsigned long i_size_update_count[I_SIZE_UPDATE_ENTRIES]; + 
unsigned long sectors_reserved; + + struct bch_write_op op; + /* must come last: */ + struct bch_write_bio bio; +}; + +extern struct bio_set *bch_writepage_bioset; + +struct dio_write { + struct closure cl; + struct kiocb *req; + long written; + long error; + loff_t offset; + bool append; + + struct iovec *iovec; + struct iovec inline_vecs[UIO_FASTIOV]; + struct iov_iter iter; + + struct mm_struct *mm; + + struct bch_write_op iop; + /* must be last: */ + struct bch_write_bio bio; +}; + +extern struct bio_set *bch_dio_write_bioset; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bio bio; +}; + +extern struct bio_set *bch_dio_read_bioset; + +#endif /* _BCACHE_FS_IO_H */ diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c index 341a6e2e2cea..19544b5db60f 100644 --- a/drivers/md/bcache/fs.c +++ b/drivers/md/bcache/fs.c @@ -6,8 +6,8 @@ #include "dirent.h" #include "extents.h" #include "fs.h" +#include "fs-io.h" #include "inode.h" -#include "io.h" #include "journal.h" #include "keylist.h" #include "super.h" @@ -16,111 +16,14 @@ #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/compat.h> -#include <linux/falloc.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/statfs.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/uio.h> -#include <linux/writeback.h> #include <linux/xattr.h> -/* - * our page flags: - * - * allocated - page has space on disk reserved for it (c->sectors_reserved) - - * -ENOSPC was checked then, shouldn't be checked later - * - * append - page is dirty from an append write, new i_size can't be written - * until after page is written; ref held on ei->i_size_dirty_count - */ - -#define PF_ANY(page, enforce) page -PAGEFLAG(Allocated, private, PF_ANY) -TESTSCFLAG(Allocated, private, PF_ANY) - -PAGEFLAG(Append, private_2, PF_ANY) -TESTSCFLAG(Append, private_2, PF_ANY) -#undef PF_ANY - -static struct bio_set *bch_writepage_bioset; static struct kmem_cache *bch_inode_cache; static void bch_inode_init(struct bch_inode_info *, struct bkey_s_c_inode); -static int bch_read_single_page(struct page *, struct address_space *); - -#define SECTORS_CACHE 1024 - -static int reserve_sectors(struct cache_set *c, unsigned sectors) -{ - u64 sectors_to_get = SECTORS_CACHE + sectors; - - if (likely(atomic64_sub_return(sectors, - &c->sectors_reserved_cache) >= 0)) - return 0; - - atomic64_add(sectors_to_get, &c->sectors_reserved); - - if (likely(!cache_set_full(c))) { - atomic64_add(sectors_to_get, &c->sectors_reserved_cache); - return 0; - } - - atomic64_sub_bug(sectors_to_get, &c->sectors_reserved); - atomic64_add(sectors, &c->sectors_reserved_cache); - return -ENOSPC; -} - -static void i_size_dirty_put(struct bch_inode_info *ei) -{ - atomic_long_dec_bug(&ei->i_size_dirty_count); -} - -static void i_size_dirty_get(struct bch_inode_info *ei) -{ - lockdep_assert_held(&ei->vfs_inode.i_rwsem); - - atomic_long_inc(&ei->i_size_dirty_count); -} - -static void i_size_update_put(struct cache_set *, - struct bch_inode_info *, - unsigned, unsigned long); - -static void bch_clear_page_bits(struct cache_set *c, struct bch_inode_info *ei, - struct page *page) -{ - EBUG_ON(!PageLocked(page)); - - if (PageAllocated(page)) { - atomic64_sub_bug(PAGE_SECTORS, &c->sectors_reserved); - ClearPageAllocated(page); - } - - if (PageAppend(page)) { - struct bch_page_state *s = (void *) &page->private; - - i_size_update_put(c, ei, s->idx, 1); - ClearPageAppend(page); - } -} - 
-/* - * In memory i_size should never be < on disk i_size: - */ -static void bch_i_size_write(struct inode *inode, loff_t new_i_size) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - - EBUG_ON(new_i_size < ei->i_size); - i_size_write(inode, new_i_size); -} - -/* returns true if we want to do the update */ -typedef int (*inode_set_fn)(struct bch_inode_info *, - struct bch_inode *, void *); /* * I_SIZE_DIRTY requires special handling: @@ -184,10 +87,10 @@ static void bch_write_inode_checks(struct cache_set *c, } } -static int __must_check __bch_write_inode(struct cache_set *c, - struct bch_inode_info *ei, - inode_set_fn set, - void *p) +int __must_check __bch_write_inode(struct cache_set *c, + struct bch_inode_info *ei, + inode_set_fn set, + void *p) { struct btree_iter iter; struct inode *inode = &ei->vfs_inode; @@ -254,151 +157,6 @@ static int __must_check bch_write_inode(struct cache_set *c, return __bch_write_inode(c, ei, NULL, NULL); } -static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi, - void *p) -{ - loff_t *new_i_size = p; - unsigned i_flags = le32_to_cpu(bi->i_flags); - - lockdep_assert_held(&ei->update_lock); - - bi->i_size = cpu_to_le64(*new_i_size); - - if (atomic_long_read(&ei->i_size_dirty_count)) - i_flags |= BCH_INODE_I_SIZE_DIRTY; - else - i_flags &= ~BCH_INODE_I_SIZE_DIRTY; - - bi->i_flags = cpu_to_le32(i_flags);; - - return 0; -} - -static int __must_check bch_write_inode_size(struct cache_set *c, - struct bch_inode_info *ei, - loff_t new_size) -{ - return __bch_write_inode(c, ei, inode_set_size, &new_size); -} - -static int inode_set_dirty(struct bch_inode_info *ei, - struct bch_inode *bi, void *p) -{ - bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)| - BCH_INODE_I_SIZE_DIRTY); - return 0; -} - -static int check_make_i_size_dirty(struct bch_inode_info *ei, - loff_t offset) -{ - bool need_set_dirty; - unsigned seq; - int ret = 0; - - do { - seq = read_seqcount_begin(&ei->shadow_i_size_lock); - need_set_dirty = offset > ei->i_size && - !(ei->i_flags & BCH_INODE_I_SIZE_DIRTY); - } while (read_seqcount_retry(&ei->shadow_i_size_lock, seq)); - - if (!need_set_dirty) - return 0; - - mutex_lock(&ei->update_lock); - - /* recheck under lock.. 
*/ - - if (offset > ei->i_size && - !(ei->i_flags & BCH_INODE_I_SIZE_DIRTY)) { - struct cache_set *c = ei->vfs_inode.i_sb->s_fs_info; - - ret = __bch_write_inode(c, ei, inode_set_dirty, NULL); - } - - mutex_unlock(&ei->update_lock); - - return ret; -} - -static void i_size_update_put(struct cache_set *c, - struct bch_inode_info *ei, - unsigned idx, - unsigned long count) -{ - struct i_size_update *u = &ei->i_size_updates.data[idx]; - loff_t new_i_size = -1; - long r; - - if (!count) - return; - - r = atomic_long_sub_return(count, &u->count); - BUG_ON(r < 0); - - if (r) - return; - - /* - * Flush i_size_updates entries in order - from the end of the fifo - - * if the entry at the end is finished (refcount has gone to 0): - */ - - mutex_lock(&ei->update_lock); - - while (!fifo_empty(&ei->i_size_updates) && - !atomic_long_read(&(u = &fifo_front(&ei->i_size_updates))->count)) { - struct i_size_update t; - - i_size_dirty_put(ei); - - if (u->new_i_size != -1) { - BUG_ON(u->new_i_size < ei->i_size); - new_i_size = u->new_i_size; - } - - fifo_pop(&ei->i_size_updates, t); - } - - if (new_i_size != -1) { - int ret = bch_write_inode_size(c, ei, new_i_size); - - ret = ret; - /* - * XXX: need to pin the inode in memory if the inode update - * fails - */ - ret = ret; - } - - mutex_unlock(&ei->update_lock); -} - -static struct i_size_update *i_size_update_new(struct bch_inode_info *ei, - loff_t new_size) -{ - struct i_size_update *u; - - lockdep_assert_held(&ei->update_lock); - - if (fifo_empty(&ei->i_size_updates) || - (test_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags) && - !fifo_full(&ei->i_size_updates))) { - clear_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); - fifo_push(&ei->i_size_updates, - (struct i_size_update) { 0 }); - - u = &fifo_back(&ei->i_size_updates); - atomic_long_set(&u->count, 0); - i_size_dirty_get(ei); - } - - u = &fifo_back(&ei->i_size_updates); - u->new_i_size = new_size; - - return u; -} - static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) { struct cache_set *c = sb->s_fs_info; @@ -832,79 +590,6 @@ static int bch_rename2(struct inode *old_dir, struct dentry *old_dentry, return bch_rename(old_dir, old_dentry, new_dir, new_dentry); } -static int __bch_truncate_page(struct address_space *mapping, - pgoff_t index, loff_t start, loff_t end) -{ - unsigned start_offset = start & (PAGE_SIZE - 1); - unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; - struct page *page; - int ret = 0; - - /* Page boundary? 
Nothing to do */ - if (!((index == start >> PAGE_SHIFT && start_offset) || - (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) - return 0; - - page = find_lock_page(mapping, index); - if (!page) { - struct inode *inode = mapping->host; - struct cache_set *c = inode->i_sb->s_fs_info; - struct btree_iter iter; - struct bkey_s_c k; - - /* - * XXX: we're doing two index lookups when we end up reading the - * page - */ - bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, - index << (PAGE_SHIFT - 9))); - k = bch_btree_iter_peek(&iter); - bch_btree_iter_unlock(&iter); - - if (!k.k || - bkey_cmp(bkey_start_pos(k.k), - POS(inode->i_ino, - (index + 1) << (PAGE_SHIFT - 9))) >= 0) - return 0; - - page = find_or_create_page(mapping, - index, - GFP_KERNEL); - if (unlikely(!page)) { - ret = -ENOMEM; - goto out; - } - } - - if (!PageUptodate(page)) - if (bch_read_single_page(page, mapping)) { - ret = -EIO; - goto unlock; - } - - if (index == start >> PAGE_SHIFT && - index == end >> PAGE_SHIFT) - zero_user_segment(page, start_offset, end_offset); - else if (index == start >> PAGE_SHIFT) - zero_user_segment(page, start_offset, PAGE_SIZE); - else if (index == end >> PAGE_SHIFT) - zero_user_segment(page, 0, end_offset); - - set_page_dirty(page); -unlock: - unlock_page(page); - put_page(page); -out: - return ret; -} - -static int bch_truncate_page(struct address_space *mapping, loff_t from) -{ - return __bch_truncate_page(mapping, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); -} - static int bch_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -922,85 +607,7 @@ static int bch_setattr(struct dentry *dentry, struct iattr *iattr) return ret; if (iattr->ia_valid & ATTR_SIZE) { - bool shrink = iattr->ia_size <= inode->i_size; - struct i_size_update *u; - unsigned idx; - - inode_dio_wait(inode); - - mutex_lock(&ei->update_lock); - - /* - * The new i_size could be bigger or smaller than the current on - * disk size (ei->i_size): - * - * If it's smaller (i.e. we actually are truncating), then in - * order to make the truncate appear atomic we have to write out - * the new i_size before discarding the data to be truncated. - * - * However, if the new i_size is bigger than the on disk i_size, - * then we _don't_ want to write the new i_size here - because - * if there are appends in flight, that would cause us to expose - * the range between the old and the new i_size before those - * appends have completed. 
- */ - - /* - * First, cancel i_size_updates that extend past the new - * i_size, so the i_size we write here doesn't get - * stomped on: - */ - fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) - if (u->new_i_size > iattr->ia_size) - u->new_i_size = -1; - - set_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); - u = i_size_update_new(ei, iattr->ia_size); - - atomic_long_inc(&u->count); - idx = u - ei->i_size_updates.data; - - if (iattr->ia_size < ei->i_size) - ret = bch_write_inode_size(c, ei, iattr->ia_size); - - mutex_unlock(&ei->update_lock); - - /* - * XXX: if we error, we leak i_size_dirty count - and we can't - * just put it, because it actually is still dirty - */ - if (unlikely(ret)) - return ret; - - /* - * truncate_setsize() does the i_size_write(), can't use - * bch_i_size_write() - */ - EBUG_ON(iattr->ia_size < ei->i_size); - truncate_setsize(inode, iattr->ia_size); - - /* - * There might be persistent reservations (from fallocate()) - * above i_size, which bch_inode_truncate() will discard - we're - * only supposed to discard them if we're doing a real truncate - * here (new i_size < current i_size): - */ - if (shrink) { - ret = bch_truncate_page(inode->i_mapping, iattr->ia_size); - if (unlikely(ret)) - return ret; - - ret = bch_inode_truncate(c, inode->i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - NULL, &ei->journal_seq); - if (unlikely(ret)) - return ret; - } - - setattr_copy(inode, iattr); - - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - i_size_update_put(c, ei, idx, 1); + ret = bch_truncate(inode, iattr); } else { mutex_lock(&ei->update_lock); setattr_copy(inode, iattr); @@ -1112,482 +719,12 @@ out: return ret < 0 ? ret : 0; } -static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - u64 ino = inode->i_ino; - u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; - u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; - int ret = 0; - - inode_lock(inode); - ret = __bch_truncate_page(inode->i_mapping, - offset >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto out; - - if (offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) { - ret = __bch_truncate_page(inode->i_mapping, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto out; - } - - truncate_pagecache_range(inode, offset, offset + len - 1); - - if (discard_start < discard_end) - ret = bch_discard(c, - POS(ino, discard_start), - POS(ino, discard_end), - 0, NULL, &ei->journal_seq); -out: - inode_unlock(inode); - - return ret; -} - -static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - struct btree_iter src; - struct btree_iter dst; - BKEY_PADDED(k) copy; - struct bkey_s_c k; - struct i_size_update *u; - loff_t new_size; - unsigned idx; - int ret; - - if ((offset | len) & (PAGE_SIZE - 1)) - return -EINVAL; - - bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS, - POS(inode->i_ino, offset >> 9)); - /* position will be set from dst iter's position: */ - bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN); - bch_btree_iter_link(&src, &dst); - - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - * - * XXX: hmm, need to prevent reads adding things 
to the pagecache until - * we're done? - */ - inode_lock(inode); - - ret = -EINVAL; - if (offset + len >= inode->i_size) - goto err; - - if (inode->i_size < len) - goto err; - - new_size = inode->i_size - len; - - inode_dio_wait(inode); - - do { - ret = filemap_write_and_wait_range(inode->i_mapping, - offset, LLONG_MAX); - if (ret) - goto err; - - ret = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - ULONG_MAX); - } while (ret == -EBUSY); - - if (ret) - goto err; - - while (bkey_cmp(dst.pos, - POS(inode->i_ino, - round_up(new_size, PAGE_SIZE) >> 9)) < 0) { - bch_btree_iter_set_pos(&src, - POS(dst.pos.inode, dst.pos.offset + (len >> 9))); - - /* Have to take intent locks before read locks: */ - ret = bch_btree_iter_traverse(&dst); - if (ret) - goto err_unwind; - - k = bch_btree_iter_peek_with_holes(&src); - if (!k.k) { - ret = -EIO; - goto err_unwind; - } - - bkey_reassemble(©.k, k); - - if (bkey_deleted(©.k.k)) - copy.k.k.type = KEY_TYPE_DISCARD; - - bch_cut_front(src.pos, ©.k); - copy.k.k.p.offset -= len >> 9; - - BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); - - ret = bch_btree_insert_at(&dst, - &keylist_single(©.k), - NULL, &ei->journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL); - if (ret < 0 && ret != -EINTR) - goto err_unwind; - - bch_btree_iter_unlock(&src); - } - - bch_btree_iter_unlock(&src); - bch_btree_iter_unlock(&dst); - - ret = bch_inode_truncate(c, inode->i_ino, - round_up(new_size, PAGE_SIZE) >> 9, - NULL, &ei->journal_seq); - if (ret) - goto err_unwind; - - mutex_lock(&ei->update_lock); - - /* - * Cancel i_size updates > new_size: - * - * Note: we're also cancelling i_size updates for appends < new_size, and - * writing the new i_size before they finish - would be better to use an - * i_size_update here like truncate, so we can sequence our i_size - * updates with outstanding appends and not have to cancel them: - */ - fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) - u->new_i_size = -1; - - ret = bch_write_inode_size(c, ei, new_size); - bch_i_size_write(inode, new_size); - - truncate_pagecache(inode, offset); - - mutex_unlock(&ei->update_lock); - - inode_unlock(inode); - - return ret; -err_unwind: - BUG(); -err: - bch_btree_iter_unlock(&src); - bch_btree_iter_unlock(&dst); - inode_unlock(inode); - return ret; -} - -static long bch_fallocate_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - struct btree_iter iter; - struct bkey_i reservation; - struct bkey_s_c k; - struct bpos end; - loff_t block_start, block_end; - loff_t new_size = offset + len; - unsigned sectors; - int ret; - - bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - - inode_lock(inode); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->i_size) { - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto err; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - /* just for __bch_truncate_page(): */ - inode_dio_wait(inode); - - ret = __bch_truncate_page(inode->i_mapping, - offset >> PAGE_SHIFT, - offset, offset + len); - - if (!ret && - offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) - ret = __bch_truncate_page(inode->i_mapping, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - - if (unlikely(ret)) - goto err; - - truncate_pagecache_range(inode, offset, offset + len - 1); - - block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(offset + len, PAGE_SIZE); - } else { - block_start = 
round_down(offset, PAGE_SIZE); - block_end = round_up(offset + len, PAGE_SIZE); - } - - bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9)); - end = POS(inode->i_ino, block_end >> 9); - - while (bkey_cmp(iter.pos, end) < 0) { - unsigned flags = 0; - - k = bch_btree_iter_peek_with_holes(&iter); - if (!k.k) { - ret = bch_btree_iter_unlock(&iter) ?: -EIO; - goto err; - } - - if (bkey_extent_is_data(k.k)) { - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch_btree_iter_advance_pos(&iter); - continue; - } - - /* don't check for -ENOSPC if we're deleting data: */ - flags |= BTREE_INSERT_NOFAIL; - } - - bkey_init(&reservation.k); - reservation.k.type = BCH_RESERVATION; - reservation.k.p = k.k->p; - reservation.k.size = k.k->size; - - bch_cut_front(iter.pos, &reservation); - bch_cut_back(end, &reservation.k); - - sectors = reservation.k.size; - - ret = reserve_sectors(c, sectors); - if (ret) - goto err; - - ret = bch_btree_insert_at(&iter, - &keylist_single(&reservation), - NULL, &ei->journal_seq, - BTREE_INSERT_ATOMIC|flags); - - atomic64_sub_bug(sectors, &c->sectors_reserved); - - if (ret < 0 && ret != -EINTR) - goto err; - - } - bch_btree_iter_unlock(&iter); - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->i_size) { - struct i_size_update *u; - unsigned idx; - - mutex_lock(&ei->update_lock); - bch_i_size_write(inode, new_size); - - u = i_size_update_new(ei, new_size); - idx = u - ei->i_size_updates.data; - atomic_long_inc(&u->count); - mutex_unlock(&ei->update_lock); - - i_size_update_put(c, ei, idx, 1); - } - - inode_unlock(inode); - - return 0; -err: - bch_btree_iter_unlock(&iter); - inode_unlock(inode); - return ret; -} - -static long bch_fallocate(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct inode *inode = file_inode(file); - - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bch_fallocate_fallocate(inode, mode, offset, len); - - if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bch_fpunch(inode, offset, len); - - if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch_fcollapse(inode, offset, len); - - return -EOPNOTSUPP; -} - -static int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); - struct address_space *mapping = inode->i_mapping; - struct cache_set *c = inode->i_sb->s_fs_info; - int ret = VM_FAULT_LOCKED; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - - /* - * i_mutex is required for synchronizing with fcollapse(), O_DIRECT - * writes - */ - inode_lock(inode); - - lock_page(page); - if (page->mapping != mapping || - page_offset(page) > i_size_read(inode)) { - unlock_page(page); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (!PageAllocated(page)) { - if (reserve_sectors(c, PAGE_SECTORS)) { - unlock_page(page); - ret = VM_FAULT_SIGBUS; - goto out; - } - - SetPageAllocated(page); - } - - set_page_dirty(page); - wait_for_stable_page(page); -out: - inode_unlock(inode); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct bch_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = bch_page_mkwrite, }; -static ssize_t -bch_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t written; - size_t write_len; - pgoff_t end; - - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; - - 
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (mapping->nrpages) { - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - /* - * If a page can not be invalidated, return 0 to fall back - * to buffered write. - */ - if (written) { - if (written == -EBUSY) - return 0; - goto out; - } - } - - written = mapping->a_ops->direct_IO(iocb, from); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - * - * Augh: this makes no sense for async writes - the second invalidate - * has to come after the new data is visible. But, we can't just move it - * to the end of the dio write path - for async writes we don't have - * i_mutex held anymore, - */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } -out: - return written; -} - -static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - ret = file_remove_privs(file); - if (ret) - goto out; - - ret = file_update_time(file); - if (ret) - goto out; - - ret = iocb->ki_flags & IOCB_DIRECT - ? 
bch_direct_write(iocb, from, iocb->ki_pos) - : generic_perform_write(file, from, iocb->ki_pos); - - if (likely(ret > 0)) - iocb->ki_pos += ret; -out: - current->backing_dev_info = NULL; - return ret; -} - -static ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - ssize_t ret; - - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret > 0) - ret = __bch_write_iter(iocb, from); - inode_unlock(inode); - - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(iocb, ret); - if (err < 0) - ret = err; - } - return ret; -} - static int bch_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); @@ -1596,65 +733,6 @@ static int bch_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file->f_mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - int ret; - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - - inode_lock(inode); - if (datasync && end <= ei->i_size) - goto out; - - /* - * If there's still outstanding appends, we may have not yet written an - * i_size that exposes the data we just fsynced - however, we can - * advance the i_size on disk up to the end of what we just explicitly - * wrote: - */ - - mutex_lock(&ei->update_lock); - - if (end > ei->i_size && - ei->i_size < inode->i_size) { - struct i_size_update *u; - unsigned idx; - loff_t new_i_size = min_t(u64, inode->i_size, - roundup(end, PAGE_SIZE)); - - BUG_ON(fifo_empty(&ei->i_size_updates)); - BUG_ON(new_i_size < ei->i_size); - - /* - * There can still be a pending i_size update < the size we're - * writing, because it may have been shared with pages > the - * size we fsynced to: - */ - fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) - if (u->new_i_size < new_i_size) - u->new_i_size = -1; - - ret = bch_write_inode_size(c, ei, new_i_size); - } - - mutex_unlock(&ei->update_lock); -out: - inode_unlock(inode); - - if (ret) - return ret; - - if (c->opts.journal_flush_disabled) - return 0; - - return bch_journal_flush_seq(&c->journal, ei->journal_seq); -} - /* Inode flags: */ static const unsigned bch_inode_flags_to_vfs_flags_map[] = { @@ -1838,8 +916,7 @@ static const struct file_operations bch_file_operations = { .fsync = bch_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .fallocate = bch_fallocate, - + .fallocate = bch_fallocate_dispatch, .unlocked_ioctl = bch_fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch_compat_fs_ioctl, @@ -1876,7 +953,6 @@ static const struct file_operations bch_dir_file_operations = { .read = generic_read_dir, .iterate = bch_readdir, .fsync = bch_fsync, - .unlocked_ioctl = bch_fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch_compat_fs_ioctl, @@ -1887,7 +963,6 @@ static const struct inode_operations bch_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = bch_setattr, - .listxattr = bch_xattr_list, .get_acl = bch_get_acl, .set_acl = bch_set_acl, @@ -1900,957 +975,21 @@ static const struct inode_operations bch_special_inode_operations = { .set_acl = bch_set_acl, }; -static int bch_bio_add_page(struct bio *bio, struct page *page) -{ - sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); - - BUG_ON(!bio->bi_max_vecs); - - if (!bio->bi_vcnt) - 
bio->bi_iter.bi_sector = offset; - else if (bio_end_sector(bio) != offset || - bio->bi_vcnt == bio->bi_max_vecs) - return -1; - - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; - - bio->bi_iter.bi_size += PAGE_SIZE; - - return 0; -} - -static void bch_readpages_end_io(struct bio *bio) -{ - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - - if (!bio->bi_error) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } - - bio_put(bio); -} - -static inline struct page *__readpage_next_page(struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - struct page *page; - int ret; - - while (*nr_pages) { - page = list_entry(pages->prev, struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - - ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS); - - /* if add_to_page_cache_lru() succeeded, page is locked: */ - put_page(page); - - if (!ret) - return page; - - (*nr_pages)--; - } - - return NULL; -} - -#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \ - for (; \ - ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\ - (_nr_pages)--) - -static int bch_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct cache_set *c = inode->i_sb->s_fs_info; - struct bio *bio = NULL; - struct page *page; - - pr_debug("reading %u pages", nr_pages); - - for_each_readpage_page(mapping, pages, nr_pages, page) { -again: - if (!bio) { - bio = bio_alloc(GFP_NOFS, - min_t(unsigned, nr_pages, - BIO_MAX_PAGES)); - - bio->bi_end_io = bch_readpages_end_io; - } - - if (bch_bio_add_page(bio, page)) { - bch_read(c, bio, inode->i_ino); - bio = NULL; - goto again; - } - } - - if (bio) - bch_read(c, bio, inode->i_ino); - - pr_debug("success"); - return 0; -} - -static int bch_readpage(struct file *file, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct cache_set *c = inode->i_sb->s_fs_info; - struct bio *bio; - - bio = bio_alloc(GFP_NOFS, 1); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); - bio->bi_end_io = bch_readpages_end_io; - - bch_bio_add_page(bio, page); - bch_read(c, bio, inode->i_ino); - - return 0; -} - -struct bch_writepage_io { - struct closure cl; - - struct bch_inode_info *ei; - unsigned long i_size_update_count[I_SIZE_UPDATE_ENTRIES]; - unsigned long sectors_reserved; - - struct bch_write_op op; - /* must come last: */ - struct bch_write_bio bio; -}; - -struct bch_writepage { - struct cache_set *c; - u64 inum; - struct bch_writepage_io *io; -}; - -static void bch_writepage_io_free(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - struct bio *bio = &io->bio.bio.bio; - - bio_put(bio); -} - -static void bch_writepage_io_done(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - struct cache_set *c = io->op.c; - struct bio *bio = &io->bio.bio.bio; - struct bch_inode_info *ei = io->ei; - struct bio_vec *bvec; - unsigned i; - - atomic64_sub_bug(io->sectors_reserved, &c->sectors_reserved); - - for (i = 0; i < ARRAY_SIZE(io->i_size_update_count); i++) - i_size_update_put(c, ei, i, io->i_size_update_count[i]); - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = 
bvec->bv_page; - - BUG_ON(!PageWriteback(page)); - - if (io->bio.bio.bio.bi_error) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - } - - end_page_writeback(page); - } - - closure_return_with_destructor(&io->cl, bch_writepage_io_free); -} - -static void bch_writepage_do_io(struct bch_writepage_io *io) -{ - pr_debug("writing %u sectors to %llu:%llu", - bio_sectors(&io->bio.bio.bio), - io->op.insert_key.k.p.inode, - (u64) io->bio.bio.bio.bi_iter.bi_sector); - - closure_call(&io->op.cl, bch_write, NULL, &io->cl); - continue_at(&io->cl, bch_writepage_io_done, io->op.c->wq); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch_writepage_io_alloc(struct bch_writepage *w, - struct bch_inode_info *ei, - struct page *page) -{ -alloc_io: - if (!w->io) { - struct bio *bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, - bch_writepage_bioset); - w->io = container_of(bio, struct bch_writepage_io, bio.bio.bio); - - closure_init(&w->io->cl, NULL); - w->io->ei = ei; - memset(w->io->i_size_update_count, 0, - sizeof(w->io->i_size_update_count)); - w->io->sectors_reserved = 0; - - bch_write_op_init(&w->io->op, w->c, &w->io->bio, NULL, - bkey_to_s_c(&KEY(w->inum, 0, 0)), - NULL, - &ei->journal_seq, 0); - } - - if (bch_bio_add_page(&w->io->bio.bio.bio, page)) { - bch_writepage_do_io(w->io); - w->io = NULL; - goto alloc_io; - } - - /* - * We shouldn't ever be handed pages for multiple inodes in a single - * pass - right? - */ - BUG_ON(ei != w->io->ei); -} - -static int __bch_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct inode *inode = page->mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct bch_writepage *w = data; - unsigned offset; - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto do_io; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index > end_index || !offset) { - unlock_page(page); - return 0; - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
- */ - zero_user_segment(page, offset, PAGE_SIZE); -do_io: - if (check_make_i_size_dirty(ei, page_offset(page) + PAGE_SIZE)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - bch_writepage_io_alloc(w, ei, page); - - /* - * Before unlocking the page, transfer refcounts to w->io: - */ - if (PageAppend(page)) { - struct bch_page_state *s = (void *) &page->private; - - /* - * i_size won't get updated and this write's data made visible - * until the i_size_update this page points to completes - so - * tell the write path to start a new one: - */ - if (&ei->i_size_updates.data[s->idx] == - &fifo_back(&ei->i_size_updates)) - set_bit(BCH_INODE_WANT_NEW_APPEND, &ei->flags); - - w->io->i_size_update_count[s->idx]++; - ClearPageAppend(page); - } - - if (PageAllocated(page)) { - w->io->sectors_reserved += PAGE_SECTORS; - ClearPageAllocated(page); - } - - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - - return 0; -} - -static int bch_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - int ret; - struct bch_writepage w = { - .c = mapping->host->i_sb->s_fs_info, - .inum = mapping->host->i_ino, - .io = NULL, - }; - - ret = write_cache_pages(mapping, wbc, __bch_writepage, &w); - - if (w.io) - bch_writepage_do_io(w.io); - - return ret; -} - -static int bch_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - int ret; - struct bch_writepage w = { - .c = inode->i_sb->s_fs_info, - .inum = inode->i_ino, - .io = NULL, - }; - - ret = __bch_writepage(page, NULL, &w); - if (ret) - return ret; - - if (w.io) - bch_writepage_do_io(w.io); - - return 0; -} - -static void bch_read_single_page_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int bch_read_single_page(struct page *page, - struct address_space *mapping) -{ - struct inode *inode = mapping->host; - struct cache_set *c = inode->i_sb->s_fs_info; - struct bio *bio; - int ret = 0; - DECLARE_COMPLETION_ONSTACK(done); - - bio = bio_alloc(GFP_NOFS, 1); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); - bio->bi_private = &done; - bio->bi_end_io = bch_read_single_page_end_io; - bch_bio_add_page(bio, page); - - bch_read(c, bio, inode->i_ino); - wait_for_completion(&done); - - if (!ret) - ret = bio->bi_error; - bio_put(bio); - - if (ret < 0) - return ret; - - SetPageUptodate(page); - - return 0; -} - -static int bch_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - struct cache_set *c = inode->i_sb->s_fs_info; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - struct page *page; - int ret = 0; - - BUG_ON(inode_unhashed(mapping->host)); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - if (!PageAllocated(page)) { - if (reserve_sectors(c, PAGE_SECTORS)) { - ret = -ENOSPC; - goto err; - } - - SetPageAllocated(page); - } - - if (PageUptodate(page)) - goto out; - - /* If we're writing entire page, don't need to read it in first: */ - if (len == PAGE_SIZE) - goto out; - - if (!offset && pos + len >= inode->i_size) { - zero_user_segment(page, len, PAGE_SIZE); - flush_dcache_page(page); - goto out; - } - - if (index > inode->i_size >> PAGE_SHIFT) { - zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); - flush_dcache_page(page); - goto out; - } - - ret = bch_read_single_page(page, 
mapping); - if (ret) - goto err; -out: - *pagep = page; - return ret; -err: - unlock_page(page); - put_page(page); - page = NULL; - goto out; -} - -static int bch_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - - lockdep_assert_held(&inode->i_rwsem); - - if (unlikely(copied < len && !PageUptodate(page))) { - /* - * The page needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - * - * userspace doesn't _have_ to redo the write, so clear - * PageAllocated: - */ - copied = 0; - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); - bch_clear_page_bits(c, ei, page); - goto out; - } - - if (!PageUptodate(page)) - SetPageUptodate(page); - if (!PageDirty(page)) - set_page_dirty(page); - - if (pos + copied > inode->i_size) { - struct i_size_update *u; - - /* - * if page already has a ref on a i_size_update, even if it's an - * older one, leave it - they have to be flushed in order so - * that's just as good as taking a ref on a newer one, if we're - * adding a newer one now - * - * - if there's no current i_size_update, or if we want to - * create a new one and there's room for a new one, create it - * - * - set current i_size_update's i_size to new i_size - * - * - if !PageAppend, take a ref on the current i_size_update - */ - - /* XXX: locking */ - mutex_lock(&ei->update_lock); - u = i_size_update_new(ei, pos + copied); - - if (!PageAppend(page)) { - struct bch_page_state *s = (void *) &page->private; - - s->idx = u - ei->i_size_updates.data; - atomic_long_inc(&u->count); - - SetPageAppend(page); - } - - bch_i_size_write(inode, pos + copied); - mutex_unlock(&ei->update_lock); - } -out: - unlock_page(page); - put_page(page); - - return copied; -} - -static void bch_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - struct inode *inode = page->mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - if (offset || length < PAGE_SIZE) - return; - - bch_clear_page_bits(c, ei, page); -} - -static int bch_releasepage(struct page *page, gfp_t gfp_mask) -{ - struct inode *inode = page->mapping->host; - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - bch_clear_page_bits(c, ei, page); - - if (PageDirty(page)) { - ClearPageDirty(page); - cancel_dirty_page(page); - } - - return 1; -} - -/* O_DIRECT */ - -static struct bio_set *bch_dio_read_bioset; -static struct bio_set *bch_dio_write_bioset; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - struct bio bio; -}; - -static void bch_dio_read_complete(struct closure *cl) -{ - struct dio_read *dio = container_of(cl, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret, 0); - bio_put(&dio->bio); -} - -static void bch_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_error) - dio->ret = bio->bi_error; - - closure_put(&dio->cl); - bio_check_pages_dirty(bio); /* transfers ownership */ -} - -static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req, - struct file *file, struct inode *inode, - 
struct iov_iter *iter, loff_t offset) -{ - struct dio_read *dio; - struct bio *bio; - unsigned long inum = inode->i_ino; - ssize_t ret = 0; - size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); - bool sync = is_sync_kiocb(req); - loff_t i_size; - - bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_read_bioset); - bio_get(bio); - - dio = container_of(bio, struct dio_read, bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - } - - dio->req = req; - dio->ret = iter->count; - - i_size = i_size_read(inode); - if (offset + dio->ret > i_size) { - dio->ret = max_t(loff_t, 0, i_size - offset); - iter->count = round_up(dio->ret, PAGE_SIZE); - } - - if (!dio->ret) { - closure_put(&dio->cl); - goto out; - } - - goto start; - while (iter->count) { - pages = iov_iter_npages(iter, BIO_MAX_PAGES); - bio = bio_alloc(GFP_KERNEL, pages); -start: - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_end_io = bch_direct_IO_read_endio; - bio->bi_private = dio; - - ret = bio_get_user_pages(bio, iter, 1); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_error = ret; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - bch_read(c, bio, inum); - } -out: - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_put(&dio->bio); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -struct dio_write { - struct closure cl; - struct kiocb *req; - long written; - long error; - loff_t offset; - bool append; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - - struct mm_struct *mm; - - struct bch_write_op iop; - /* must be last: */ - struct bch_write_bio bio; -}; - -static void __bch_dio_write_complete(struct dio_write *dio) -{ - inode_dio_end(dio->req->ki_filp->f_inode); - - if (dio->iovec && dio->iovec != dio->inline_vecs) - kfree(dio->iovec); - - bio_put(&dio->bio.bio.bio); -} - -static void bch_dio_write_complete(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, cl); - struct kiocb *req = dio->req; - long ret = dio->written ?: dio->error; - - __bch_dio_write_complete(dio); - req->ki_complete(req, ret, 0); -} - -static void bch_dio_write_done(struct dio_write *dio) -{ - struct bio_vec *bv; - int i; - - dio->written += dio->iop.written << 9; - - if (dio->iop.error) - dio->error = dio->iop.error; - - bio_for_each_segment_all(bv, &dio->bio.bio.bio, i) - put_page(bv->bv_page); - - if (dio->iter.count) - bio_reset(&dio->bio.bio.bio); -} - -static void bch_do_direct_IO_write(struct dio_write *dio, bool sync) -{ - struct file *file = dio->req->ki_filp; - struct inode *inode = file->f_inode; - struct bch_inode_info *ei = to_bch_ei(inode); - struct cache_set *c = inode->i_sb->s_fs_info; - struct bio *bio = &dio->bio.bio.bio; - unsigned flags = BCH_WRITE_CHECK_ENOSPC; - int ret; - - if (file->f_flags & O_DSYNC || IS_SYNC(file->f_mapping->host)) - flags |= BCH_WRITE_FLUSH; - - while (dio->iter.count) { - bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9; - - ret = bio_get_user_pages(bio, &dio->iter, 0); - if (ret < 0) { - dio->error = ret; - break; - } - - 
bch_write_op_init(&dio->iop, c, &dio->bio, NULL, - bkey_to_s_c(&KEY(inode->i_ino, - bio_end_sector(bio), - bio_sectors(bio))), - NULL, - &ei->journal_seq, flags); - - task_io_account_write(bio->bi_iter.bi_size); - - closure_call(&dio->iop.cl, bch_write, NULL, &dio->cl); - - if (!sync) - break; - - closure_sync(&dio->cl); - bch_dio_write_done(dio); - } -} - -static void bch_dio_write_loop_async(struct closure *cl) -{ - struct dio_write *dio = - container_of(cl, struct dio_write, cl); - - bch_dio_write_done(dio); - - if (dio->iter.count && !dio->error) { - use_mm(dio->mm); - bch_do_direct_IO_write(dio, false); - unuse_mm(dio->mm); - - continue_at(&dio->cl, - bch_dio_write_loop_async, - dio->iter.count ? system_wq : NULL); - } else { -#if 0 - closure_return_with_destructor(cl, bch_dio_write_complete); -#else - closure_debug_destroy(cl); - bch_dio_write_complete(cl); -#endif - } -} - -static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req, - struct file *file, struct inode *inode, - struct iov_iter *iter, loff_t offset) -{ - struct bch_inode_info *ei = to_bch_ei(inode); - struct dio_write *dio; - struct bio *bio; - size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); - ssize_t ret; - bool sync; - - lockdep_assert_held(&inode->i_rwsem); - - bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_write_bioset); - - dio = container_of(bio, struct dio_write, bio.bio.bio); - dio->req = req; - dio->written = 0; - dio->error = 0; - dio->offset = offset; - dio->append = false; - dio->iovec = NULL; - dio->iter = *iter; - dio->mm = current->mm; - - if (offset + iter->count > inode->i_size) { - /* - * XXX: try and convert this to i_size_update_new(), and maybe - * make async O_DIRECT appends work - */ - - dio->append = true; - i_size_dirty_get(ei); - } - - ret = check_make_i_size_dirty(ei, offset + iter->count); - if (ret) { - if (dio->append) - i_size_dirty_put(ei); - bio_put(bio); - return ret; - } - - closure_init(&dio->cl, NULL); - - inode_dio_begin(inode); - - /* - * appends are sync in order to do the i_size update under - * i_mutex, after we know the write has completed successfully - */ - sync = is_sync_kiocb(req) || dio->append; - - bch_do_direct_IO_write(dio, sync); - - if (sync) { - closure_debug_destroy(&dio->cl); - ret = dio->written ?: dio->error; - - if (dio->append) { - loff_t new_i_size = offset + dio->written; - int ret2 = 0; - - if (dio->written && - new_i_size > inode->i_size) { - struct i_size_update *u; - unsigned idx; - - mutex_lock(&ei->update_lock); - - bch_i_size_write(inode, new_i_size); - - fifo_for_each_entry_ptr(u, &ei->i_size_updates, idx) { - if (u->new_i_size < new_i_size) - u->new_i_size = -1; - else - BUG(); - } - - i_size_dirty_put(ei); - ret2 = bch_write_inode_size(c, ei, new_i_size); - - mutex_unlock(&ei->update_lock); - } else { - i_size_dirty_put(ei); - } - } - - __bch_dio_write_complete(dio); - return ret; - } else { - if (dio->iter.count) { - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iovec = kmalloc(dio->iter.nr_segs * - sizeof(struct iovec), - GFP_KERNEL); - if (!dio->iovec) - dio->error = -ENOMEM; - } else { - dio->iovec = dio->inline_vecs; - } - - memcpy(dio->iovec, - dio->iter.iov, - dio->iter.nr_segs * sizeof(struct iovec)); - dio->iter.iov = dio->iovec; - } - - continue_at_noreturn(&dio->cl, - bch_dio_write_loop_async, - dio->iter.count ? 
system_wq : NULL); - return -EIOCBQUEUED; - } -} - -static ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct inode *inode = file->f_inode; - struct cache_set *c = inode->i_sb->s_fs_info; - - if ((req->ki_pos|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - - return ((iov_iter_rw(iter) == WRITE) - ? bch_direct_IO_write - : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); -} - -#ifdef CONFIG_MIGRATION -static int bch_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) -{ - int ret; - - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (PageAllocated(page)) { - ClearPageAllocated(page); - SetPageAllocated(newpage); - } - - if (PageAppend(page)) { - ClearPageAppend(page); - SetPageAppend(newpage); - } - - migrate_page_copy(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -#endif - static const struct address_space_operations bch_address_space_operations = { - .writepage = bch_writepage, - .readpage = bch_readpage, - .writepages = bch_writepages, - .readpages = bch_readpages, - - .set_page_dirty = __set_page_dirty_nobuffers, - - .write_begin = bch_write_begin, - .write_end = bch_write_end, - .invalidatepage = bch_invalidatepage, - .releasepage = bch_releasepage, - - .direct_IO = bch_direct_IO, - + .writepage = bch_writepage, + .readpage = bch_readpage, + .writepages = bch_writepages, + .readpages = bch_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = bch_write_begin, + .write_end = bch_write_end, + .invalidatepage = bch_invalidatepage, + .releasepage = bch_releasepage, + .direct_IO = bch_direct_IO, #ifdef CONFIG_MIGRATION - .migratepage = bch_migrate_page, + .migratepage = bch_migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = generic_error_remove_page, }; static void bch_inode_init(struct bch_inode_info *ei, diff --git a/drivers/md/bcache/fs.h b/drivers/md/bcache/fs.h index 6b08a8895d93..8972d2e360fb 100644 --- a/drivers/md/bcache/fs.h +++ b/drivers/md/bcache/fs.h @@ -42,11 +42,6 @@ enum { BCH_INODE_WANT_NEW_APPEND, }; -/* stored in page->private: */ -struct bch_page_state { - u8 idx; -}; - #define to_bch_ei(_inode) \ container_of(_inode, struct bch_inode_info, vfs_inode) @@ -55,4 +50,11 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode *, void *); + +int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, + inode_set_fn, void *); + #endif /* _BCACHE_FS_H */ |
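
The final fs.h hunk exports __bch_write_inode() together with the inode_set_fn callback type, so the code moved into fs-io.c can keep applying its on-disk inode updates under ei->update_lock (the same convention the inode_set_dirty and bch_write_inode_size() callers above rely on). A minimal sketch of that calling convention follows; it is not part of this patch, and the *_example identifiers and the flag-mask argument are made up purely for illustration:

/*
 * Illustrative only -- not code from this commit. Assumes the
 * declarations added to fs.h above; identifiers ending in _example
 * are hypothetical.
 */
static int inode_set_flags_example(struct bch_inode_info *ei,
				   struct bch_inode *bi, void *p)
{
	unsigned *mask = p;

	/* __bch_write_inode() invokes the callback with update_lock held */
	lockdep_assert_held(&ei->update_lock);

	/* mutate only the packed on-disk fields; a nonzero return aborts */
	bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags) | *mask);
	return 0;
}

static int __must_check set_inode_flags_example(struct cache_set *c,
						struct bch_inode_info *ei,
						unsigned mask)
{
	return __bch_write_inode(c, ei, inode_set_flags_example, &mask);
}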