diff options
Diffstat (limited to 'fs')
255 files changed, 8916 insertions, 4481 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 814ac4e213a..0a93dc1cb4a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -1,6 +1,6 @@ config 9P_FS - tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" - depends on INET && NET_9P && EXPERIMENTAL + tristate "Plan 9 Resource Sharing Support (9P2000)" + depends on INET && NET_9P help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. @@ -10,7 +10,6 @@ config 9P_FS If unsure, say N. if 9P_FS - config 9P_FSCACHE bool "Enable 9P client caching support (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7f6c6770319..8d7f3e69ae2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d) int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) { + dentry_unhash(d); return v9fs_remove(i, d, 1); } @@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 82a7c38ddad..691c78f58be 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, if (IS_ERR(inode_fid)) { err = PTR_ERR(inode_fid); mutex_unlock(&v9inode->v_mutex); - goto error; + goto err_clunk_old_fid; } v9inode->writeback_fid = (void *) inode_fid; } @@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Since we are opening a file, assign the open fid to the file */ filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { - p9_client_clunk(ofid); - return PTR_ERR(filp); + err = PTR_ERR(filp); + goto err_clunk_old_fid; } filp->private_data = ofid; #ifdef CONFIG_9P_FSCACHE @@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, return 0; error: - if (ofid) - p9_client_clunk(ofid); if (fid) p9_client_clunk(fid); +err_clunk_old_fid: + if (ofid) + p9_client_clunk(ofid); return err; } diff --git a/fs/Kconfig b/fs/Kconfig index f3aa9b08b22..19891aab9c6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - bool + tristate config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT @@ -124,6 +124,7 @@ config TMPFS config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS + select TMPFS_XATTR select GENERIC_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -134,6 +135,22 @@ config TMPFS_POSIX_ACL If you don't know what Access Control Lists are, say N. +config TMPFS_XATTR + bool "Tmpfs extended attributes" + depends on TMPFS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + Currently this enables support for the trusted.* and + security.* namespaces. + + You need this for POSIX ACL support on tmpfs. + + If unsure, say N. + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index e3e9efc1fdd..03330e2e390 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); + dentry_unhash(dentry); + return affs_remove_header(dentry); } @@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 20c106f2492..2c4e0516004 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + dentry_unhash(dentry); + ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; @@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + vnode = AFS_FS_I(old_dentry->d_inode); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index f55ae23b137..87d95a8cddb 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EACCES; + dentry_unhash(dentry); + if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index b14cebfd904..c7d1d06b048 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct bfs_sb_info *info; int error = -ENOENT; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + old_bh = new_bh = NULL; old_inode = old_dentry->d_inode; if (S_ISDIR(old_inode->i_mode)) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 397d3057d33..1bffbe0ed77 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs) int res; char buf[16]; + memset(&bprm, 0, sizeof(bprm)); + /* Create the file name */ sprintf(buf, "/lib/lib%d.so", id); @@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs) if (!bprm.cred) goto out; + /* We don't really care about recalculating credentials at this point + * as we're past the point of no return and are dealing with shared + * libraries. + */ + bprm.cred_prepared = 1; + res = prepare_binprm(&bprm); if (!IS_ERR_VALUE(res)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 257b00e9842..1f2b1997833 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1120,6 +1120,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto restart; } } + + if (!ret && !bdev->bd_openers) { + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); + } + /* * If the device is invalidated, rescan partition * if open succeeded or failed with -ENOMEDIUM. @@ -1130,14 +1139,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) rescan_partitions(disk, bdev); if (ret) goto out_clear; - - if (!bdev->bd_openers) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } } else { struct block_device *whole; whole = bdget_disk(disk, 0); @@ -1237,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) res = __blkdev_get(bdev, mode, 0); if (whole) { + struct gendisk *disk = whole->bd_disk; + /* finish claiming */ mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); @@ -1263,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) spin_unlock(&bdev_lock); /* - * Block event polling for write claims. Any write - * holder makes the write_holder state stick until all - * are released. This is good enough and tracking - * individual writeable reference is too fragile given - * the way @mode is used in blkdev_get/put(). + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). */ - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && + !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { bdev->bd_write_holder = true; - disk_block_events(bdev->bd_disk); + disk_block_events(disk); } mutex_unlock(&bdev->bd_mutex); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 96fcfa522da..4f9893243da 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -11,6 +11,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/prefetch.h> +#include <linux/cleancache.h> #include "extent_io.h" #include "extent_map.h" #include "compat.h" @@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); + if (!PageUptodate(page)) { + if (cleancache_get_page(page) == 0) { + BUG_ON(blocksize != PAGE_SIZE); + goto out; + } + } + end = page_end; while (1) { lock_extent(tree, start, end, GFP_NOFS); @@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, cur = cur + iosize; page_offset += iosize; } +out: if (!nr) { if (!PageError(page)) SetPageUptodate(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 199a8013431..f340f7c99d0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -709,7 +709,7 @@ again: WARN_ON(cur->checked); if (!list_empty(&cur->upper)) { /* - * the backref was added previously when processsing + * the backref was added previously when processing * backref of type BTRFS_TREE_BLOCK_REF_KEY */ BUG_ON(!list_is_singular(&cur->upper)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0ac712efcdf..be4ffa12f3e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -39,6 +39,7 @@ #include <linux/miscdevice.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/cleancache.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_root = root_dentry; save_mount_options(sb, data); + cleancache_init_fs(sb); return 0; fail_close: diff --git a/fs/buffer.c b/fs/buffer.c index a08bb8e61c6..698c6b2cc46 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/cleancache.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); @@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write); * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. + * + * Direct callers of this function should call vfs_check_frozen() so that page + * fault does not busyloop until the fs is thawed. */ -int -block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; unsigned long end; loff_t size; - int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ + int ret; lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || (page_offset(page) > size)) { - /* page got truncated out from underneath us */ - unlock_page(page); - goto out; + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; } /* page is wholly or partially inside EOF */ @@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (!ret) ret = block_commit_write(page, 0, end); - if (unlikely(ret)) { - unlock_page(page); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else /* -ENOSPC, -EIO, etc */ - ret = VM_FAULT_SIGBUS; - } else - ret = VM_FAULT_LOCKED; - -out: + if (unlikely(ret < 0)) + goto out_unlock; + /* + * Freezing in progress? We check after the page is marked dirty and + * with page lock held so if the test here fails, we are sure freezing + * code will wait during syncing until the page fault is done - at that + * point page will be dirty and unlocked so freezing code will write it + * and writeprotect it again. + */ + set_page_dirty(page); + if (inode->i_sb->s_frozen != SB_UNFROZEN) { + ret = -EAGAIN; + goto out_unlock; + } + return 0; +out_unlock: + unlock_page(page); return ret; } +EXPORT_SYMBOL(__block_page_mkwrite); + +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int ret; + struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; + + /* + * This check is racy but catches the common case. The check in + * __block_page_mkwrite() is reliable. + */ + vfs_check_frozen(sb, SB_FREEZE_WRITE); + ret = __block_page_mkwrite(vma, vmf, get_block); + return block_page_mkwrite_return(ret); +} EXPORT_SYMBOL(block_page_mkwrite); /* diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 38b8ab55492..33da49dc3cc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -848,7 +848,8 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&fsc->client->osdc, req, true); + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); req = NULL; /* continue? */ @@ -880,8 +881,6 @@ release_pvec_pages: out: if (req) ceph_osdc_put_request(req); - if (rc > 0) - rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2a5404c1c42..1f72b00447c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -569,7 +569,8 @@ retry: list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } + } else if (new_cap) + ceph_put_cap(mdsc, new_cap); if (!ci->i_snap_realm) { /* @@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_session *session, int *open_target_sessions) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; unsigned mseq = le32_to_cpu(ex->migrate_seq); @@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, * export targets, so that we get the matching IMPORT */ *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap); } @@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ci->i_cap_exporting_issued = 0; ci->i_cap_exporting_mseq = 0; ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); } else { dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); @@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = igrab(&ci->vfs_inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); if (inode) { ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, @@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1a867a3601a..33729e822bb 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -360,7 +360,7 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); - while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; @@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int left; + const int bufsize = 1024; if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(1024, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_NOFS); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = - sprintf(cf->dir_info, + snprintf(cf->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e41056174bf..a610d3d6748 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_nfs_fh *fh) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, vino.ino = fh->ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { @@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + igrab(inode); ceph_mdsc_put_request(req); - inode = ceph_find_inode(sb, vino); if (!inode) return ERR_PTR(err ? err : -ESTALE); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d0fae4ce9ba..79743d146be 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc, if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); @@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; } ceph_mdsc_put_request(req); @@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, { struct super_block *sb = mdsc->fsc->sb; struct inode *inode; - struct ceph_inode_info *ci; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; @@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease no inode %llx\n", vino.ino); goto release; } - ci = ceph_inode(inode); /* dentry */ parent = d_find_alias(inode); @@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->cap_flush_seq = 0; INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4e3a9cc0bba..7d8a0d662d5 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -278,6 +278,7 @@ struct ceph_mds_client { u64 cap_flush_seq; struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 2b8dae4d121..a46126fd573 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) int len = de->d_name.len; int error; + dentry_unhash(de); + error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ @@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); diff --git a/fs/compat.c b/fs/compat.c index 72fe6cda910..0ea00832de2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1306,241 +1306,6 @@ compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int return do_sys_open(dfd, filename, flags, mode); } -/* - * compat_count() counts the number of arguments/envelopes. It is basically - * a copy of count() from fs/exec.c, except that it works with 32 bit argv - * and envp pointers. - */ -static int compat_count(compat_uptr_t __user *argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - compat_uptr_t p; - - if (get_user(p, argv)) - return -EFAULT; - if (!p) - break; - argv++; - if (i++ >= max) - return -E2BIG; - - if (fatal_signal_pending(current)) - return -ERESTARTNOHAND; - cond_resched(); - } - } - return i; -} - -/* - * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c - * except that it works with 32 bit argv and envp pointers. - */ -static int compat_copy_strings(int argc, compat_uptr_t __user *argv, - struct linux_binprm *bprm) -{ - struct page *kmapped_page = NULL; - char *kaddr = NULL; - unsigned long kpos = 0; - int ret; - - while (argc-- > 0) { - compat_uptr_t str; - int len; - unsigned long pos; - - if (get_user(str, argv+argc) || - !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) { - ret = -EFAULT; - goto out; - } - - if (len > MAX_ARG_STRLEN) { - ret = -E2BIG; - goto out; - } - - /* We're going to work our way backwords. */ - pos = bprm->p; - str += len; - bprm->p -= len; - - while (len > 0) { - int offset, bytes_to_copy; - - if (fatal_signal_pending(current)) { - ret = -ERESTARTNOHAND; - goto out; - } - cond_resched(); - - offset = pos % PAGE_SIZE; - if (offset == 0) - offset = PAGE_SIZE; - - bytes_to_copy = offset; - if (bytes_to_copy > len) - bytes_to_copy = len; - - offset -= bytes_to_copy; - pos -= bytes_to_copy; - str -= bytes_to_copy; - len -= bytes_to_copy; - - if (!kmapped_page || kpos != (pos & PAGE_MASK)) { - struct page *page; - - page = get_arg_page(bprm, pos, 1); - if (!page) { - ret = -E2BIG; - goto out; - } - - if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); - kunmap(kmapped_page); - put_page(kmapped_page); - } - kmapped_page = page; - kaddr = kmap(kmapped_page); - kpos = pos & PAGE_MASK; - flush_cache_page(bprm->vma, kpos, - page_to_pfn(kmapped_page)); - } - if (copy_from_user(kaddr+offset, compat_ptr(str), - bytes_to_copy)) { - ret = -EFAULT; - goto out; - } - } - } - ret = 0; -out: - if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); - kunmap(kmapped_page); - put_page(kmapped_page); - } - return ret; -} - -/* - * compat_do_execve() is mostly a copy of do_execve(), with the exception - * that it processes 32 bit argv and envp pointers. - */ -int compat_do_execve(char * filename, - compat_uptr_t __user *argv, - compat_uptr_t __user *envp, - struct pt_regs * regs) -{ - struct linux_binprm *bprm; - struct file *file; - struct files_struct *displaced; - bool clear_in_exec; - int retval; - - retval = unshare_files(&displaced); - if (retval) - goto out_ret; - - retval = -ENOMEM; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - goto out_files; - - retval = prepare_bprm_creds(bprm); - if (retval) - goto out_free; - - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; - current->in_execve = 1; - - file = open_exec(filename); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - - sched_exec(); - - bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; - - retval = bprm_mm_init(bprm); - if (retval) - goto out_file; - - bprm->argc = compat_count(argv, MAX_ARG_STRINGS); - if ((retval = bprm->argc) < 0) - goto out; - - bprm->envc = compat_count(envp, MAX_ARG_STRINGS); - if ((retval = bprm->envc) < 0) - goto out; - - retval = prepare_binprm(bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm->filename, bprm); - if (retval < 0) - goto out; - - bprm->exec = bprm->p; - retval = compat_copy_strings(bprm->envc, envp, bprm); - if (retval < 0) - goto out; - - retval = compat_copy_strings(bprm->argc, argv, bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(bprm, regs); - if (retval < 0) - goto out; - - /* execve succeeded */ - current->fs->in_exec = 0; - current->in_execve = 0; - acct_update_integrals(current); - free_bprm(bprm); - if (displaced) - put_files_struct(displaced); - return retval; - -out: - if (bprm->mm) { - acct_arg_size(bprm, 0); - mmput(bprm->mm); - } - -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - -out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; - current->in_execve = 0; - -out_free: - free_bprm(bprm); - -out_files: - if (displaced) - reset_files_struct(displaced); -out_ret: - return retval; -} - #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 9a37a9b6de3..9d17d350abc 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; + dentry_unhash(dentry); + if (dentry->d_parent == configfs_sb->s_root) return -EPERM; diff --git a/fs/dcache.c b/fs/dcache.c index 18b2a1f10ed..37f72ee5bf7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1220,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent) EXPORT_SYMBOL(shrink_dcache_parent); /* - * Scan `nr' dentries and return the number which remain. + * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. * * We need to avoid reentering the filesystem if the caller is performing a * GFP_NOFS allocation attempt. One example deadlock is: @@ -1231,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 0d329ff8ed4..9b026ea8baa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -100,6 +100,7 @@ struct dlm_cluster { unsigned int cl_log_debug; unsigned int cl_protocol; unsigned int cl_timewarn_cs; + unsigned int cl_waitwarn_us; }; enum { @@ -114,6 +115,7 @@ enum { CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, + CLUSTER_ATTR_WAITWARN_US, }; struct cluster_attribute { @@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); +CLUSTER_ATTR(waitwarn_us, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, NULL, }; @@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; + cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#define DEFAULT_WAITWARN_US 0 struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = { .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_protocol = DEFAULT_PROTOCOL, - .ci_timewarn_cs = DEFAULT_TIMEWARN_CS + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, + .ci_waitwarn_us = DEFAULT_WAITWARN_US }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 4f1d6fce58c..dd0ce24d5a8 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -28,6 +28,7 @@ struct dlm_config_info { int ci_log_debug; int ci_protocol; int ci_timewarn_cs; + int ci_waitwarn_us; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index b9420491301..0262451eb9c 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -209,6 +209,7 @@ struct dlm_args { #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 +#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -245,6 +246,7 @@ struct dlm_lkb { int8_t lkb_wait_type; /* type of reply waiting for */ int8_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ @@ -254,6 +256,7 @@ struct dlm_lkb { struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; ktime_t lkb_timestamp; + ktime_t lkb_wait_time; unsigned long lkb_timeout_cs; struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 56d6bfcc1e4..f71d0b5abd9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -799,10 +799,84 @@ static int msg_reply_type(int mstype) return -1; } +static int nodeid_warned(int nodeid, int num_nodes, int *warned) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (!warned[i]) { + warned[i] = nodeid; + return 0; + } + if (warned[i] == nodeid) + return 1; + } + return 0; +} + +void dlm_scan_waiters(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + ktime_t zero = ktime_set(0, 0); + s64 us; + s64 debug_maxus = 0; + u32 debug_scanned = 0; + u32 debug_expired = 0; + int num_nodes = 0; + int *warned = NULL; + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_equal(lkb->lkb_wait_time, zero)) + continue; + + debug_scanned++; + + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); + + if (us < dlm_config.ci_waitwarn_us) + continue; + + lkb->lkb_wait_time = zero; + + debug_expired++; + if (us > debug_maxus) + debug_maxus = us; + + if (!num_nodes) { + num_nodes = ls->ls_num_nodes; + warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); + if (warned) + memset(warned, 0, num_nodes * sizeof(int)); + } + if (!warned) + continue; + if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) + continue; + + log_error(ls, "waitwarn %x %lld %d us check connection to " + "node %d", lkb->lkb_id, (long long)us, + dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); + } + mutex_unlock(&ls->ls_waiters_mutex); + + if (warned) + kfree(warned); + + if (debug_expired) + log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", + debug_scanned, debug_expired, + dlm_config.ci_waitwarn_us, (long long)debug_maxus); +} + /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ -static int add_to_waiters(struct dlm_lkb *lkb, int mstype) +static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error = 0; @@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; + lkb->lkb_wait_time = ktime_get(); + lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); out: @@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; - if (ms != &ls->ls_stub_ms) + if (ms->m_flags != DLM_IFL_STUB_MS) mutex_lock(&ls->ls_waiters_mutex); error = _remove_from_waiters(lkb, ms->m_type, ms); - if (ms != &ls->ls_stub_ms) + if (ms->m_flags != DLM_IFL_STUB_MS) mutex_unlock(&ls->ls_waiters_mutex); return error; } @@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_to_us(lkb->lkb_wait_time)) + lkb->lkb_wait_time = ktime_get(); + } + mutex_unlock(&ls->ls_waiters_mutex); } /* lkb is master or local copy */ @@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become compatible with other granted locks */ -static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms) +static void munge_demoted(struct dlm_lkb *lkb) { - if (ms->m_type != DLM_MSG_CONVERT_REPLY) { - log_print("munge_demoted %x invalid reply type %d", - lkb->lkb_id, ms->m_type); - return; - } - if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { log_print("munge_demoted %x invalid modes gr %d rq %d", lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); @@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) struct dlm_mhandle *mh; int to_nodeid, error; - error = add_to_waiters(lkb, mstype); + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); if (error) return error; - to_nodeid = r->res_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); if (error) goto fail; @@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) /* down conversions go without a reply from the master */ if (!error && down_conversion(lkb)) { remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; r->res_ls->ls_stub_ms.m_result = 0; - r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags; __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); } @@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) struct dlm_mhandle *mh; int to_nodeid, error; - error = add_to_waiters(lkb, DLM_MSG_LOOKUP); + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); if (error) return error; - to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); if (error) goto fail; @@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) { + if (ms->m_flags == DLM_IFL_STUB_MS) + return; + lkb->lkb_sbflags = ms->m_sbflags; lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | (ms->m_flags & 0x0000FFFF); @@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, /* convert was queued on remote master */ receive_flags_reply(lkb, ms); if (is_demoted(lkb)) - munge_demoted(lkb, ms); + munge_demoted(lkb); del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); add_timeout(lkb); @@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, /* convert was granted on remote master */ receive_flags_reply(lkb, ms); if (is_demoted(lkb)) - munge_demoted(lkb, ms); + munge_demoted(lkb); grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); break; @@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) dlm_put_lockspace(ls); } -static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_stub) { if (middle_conversion(lkb)) { hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; - ls->ls_stub_ms.m_result = -EINPROGRESS; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_convert_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CONVERT_REPLY; + ms_stub->m_result = -EINPROGRESS; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_convert_reply(lkb, ms_stub); /* Same special case as in receive_rcom_lock_args() */ lkb->lkb_grmode = DLM_LOCK_IV; @@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; + ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message)); + if (!ms_stub) { + log_error(ls, "dlm_recover_waiters_pre no mem"); + return; + } + mutex_lock(&ls->ls_waiters_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { - log_debug(ls, "pre recover waiter lkid %x type %d flags %x", - lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags); + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "recover_waiter %x nodeid %d " + "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + } /* all outstanding lookups, regardless of destination will be resent after recovery is done */ @@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) break; case DLM_MSG_CONVERT: - recover_convert_waiter(ls, lkb); + recover_convert_waiter(ls, lkb, ms_stub); break; case DLM_MSG_UNLOCK: hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; - ls->ls_stub_ms.m_result = stub_unlock_result; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_unlock_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; + ms_stub->m_result = stub_unlock_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_unlock_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; case DLM_MSG_CANCEL: hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; - ls->ls_stub_ms.m_result = stub_cancel_result; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_cancel_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CANCEL_REPLY; + ms_stub->m_result = stub_cancel_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_cancel_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; @@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) schedule(); } mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_stub); } static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) @@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) ou = is_overlap_unlock(lkb); err = 0; - log_debug(ls, "recover_waiters_post %x type %d flags %x %s", - lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); + log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", + lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); /* At this point we assume that we won't get a reply to any previous op or overlap op on this lock. First, do a big diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 88e93c80cc2..265017a7c3e 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_waiters(struct dlm_ls *ls); void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f994a7dfda8..14cbf409975 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void) static int dlm_scand(void *data) { struct dlm_ls *ls; - int timeout_jiffies = dlm_config.ci_scan_secs * HZ; while (!kthread_should_stop()) { ls = find_ls_to_scan(); @@ -252,13 +251,14 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); + dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; } - } else { - schedule_timeout_interruptible(timeout_jiffies); + continue; } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } diff --git a/fs/dlm/main.c b/fs/dlm/main.c index b80e0aa3cfa..5a59efa0bb4 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -50,7 +50,7 @@ static int __init init_dlm(void) if (error) goto out_netlink; - printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); + printk("DLM installed\n"); return 0; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 30d8b85febb..e2b87800436 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -71,6 +71,36 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } +/* If a process was killed while waiting for the only plock on a file, + locks_remove_posix will not see any lock on the file so it won't + send an unlock-close to us to pass on to userspace to clean up the + abandoned waiter. So, we have to insert the unlock-close when the + lock call is interrupted. */ + +static void do_unlock_close(struct dlm_ls *ls, u64 number, + struct file *file, struct file_lock *fl) +{ + struct plock_op *op; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return; + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = 0; + op->info.end = OFFSET_MAX; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); +} + int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, int cmd, struct file_lock *fl) { @@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); - if (xop->callback == NULL) - wait_event(recv_wq, (op->done != 0)); - else { + if (xop->callback == NULL) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + } else { rv = FILE_LOCK_DEFERRED; goto out; } @@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, else op->info.owner = (__u64)(long) fl->fl_owner; + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + send_op(op); wait_event(recv_wq, (op->done != 0)); @@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { op = list_entry(send_list.next, struct plock_op, list); - list_move(&op->list, &recv_list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move(&op->list, &recv_list); memcpy(&info, &op->info, sizeof(info)); } spin_unlock(&ops_lock); @@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (!op) return -EAGAIN; + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + kfree(op); + if (copy_to_user(u, &info, sizeof(info))) return -EFAULT; return sizeof(info); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d5ab3fe7c19..e96bf3e9be8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, out_sig: sigprocmask(SIG_SETMASK, &tmpsig, NULL); - recalc_sigpending(); out_free: kfree(kbuf); return error; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 98b77c89494..c00e055b628 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) static void drop_slab(void) { int nr_objects; + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; do { - nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + nr_objects = shrink_slab(&shrink, 1000, 1000); } while (nr_objects > 10); } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 4d4cc6a90cd..227b409b840 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct dentry *lower_dir_dentry; int rc; + dentry_unhash(dentry); + lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); @@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap = NULL; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); diff --git a/fs/exec.c b/fs/exec.c index 8328beb9016..ea5f748906a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -42,7 +42,6 @@ #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> -#include <linux/proc_fs.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -55,6 +54,7 @@ #include <linux/fs_struct.h> #include <linux/pipe_fs_i.h> #include <linux/oom.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -166,8 +166,13 @@ out: } #ifdef CONFIG_MMU - -void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); @@ -186,7 +191,7 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) #endif } -struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -194,7 +199,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_STACK_GROWSUP if (write) { - ret = expand_stack_downwards(bprm->vma, pos); + ret = expand_downwards(bprm->vma, pos); if (ret < 0) return NULL; } @@ -305,11 +310,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } -struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -398,22 +403,56 @@ err: return err; } +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + /* * count() counts the number of strings in array ARGV. */ -static int count(const char __user * const __user * argv, int max) +static int count(struct user_arg_ptr argv, int max) { int i = 0; - if (argv != NULL) { + if (argv.ptr.native != NULL) { for (;;) { - const char __user * p; + const char __user *p = get_user_arg_ptr(argv, i); - if (get_user(p, argv)) - return -EFAULT; if (!p) break; - argv++; + + if (IS_ERR(p)) + return -EFAULT; + if (i++ >= max) return -E2BIG; @@ -430,7 +469,7 @@ static int count(const char __user * const __user * argv, int max) * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ -static int copy_strings(int argc, const char __user *const __user *argv, +static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; @@ -443,16 +482,18 @@ static int copy_strings(int argc, const char __user *const __user *argv, int len; unsigned long pos; - if (get_user(str, argv+argc) || - !(len = strnlen_user(str, MAX_ARG_STRLEN))) { - ret = -EFAULT; + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) goto out; - } - if (!valid_arg_len(bprm, len)) { - ret = -E2BIG; + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) goto out; - } /* We're going to work our way backwords. */ pos = bprm->p; @@ -519,14 +560,19 @@ out: /* * Like copy_strings, but get argv and its values from kernel memory. */ -int copy_strings_kernel(int argc, const char *const *argv, +int copy_strings_kernel(int argc, const char *const *__argv, struct linux_binprm *bprm) { int r; mm_segment_t oldfs = get_fs(); + struct user_arg_ptr argv = { + .ptr.native = (const char __user *const __user *)__argv, + }; + set_fs(KERNEL_DS); - r = copy_strings(argc, (const char __user *const __user *)argv, bprm); + r = copy_strings(argc, argv, bprm); set_fs(oldfs); + return r; } EXPORT_SYMBOL(copy_strings_kernel); @@ -553,7 +599,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; - struct mmu_gather *tlb; + struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -579,12 +625,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(tlb, new_end, old_end, new_end, + free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -593,10 +639,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(tlb, old_start, old_end, new_end, + free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } - tlb_finish_mmu(tlb, new_end, old_end); + tlb_finish_mmu(&tlb, new_end, old_end); /* * Shrink the vma to just the new range. Always succeeds. @@ -1004,6 +1050,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk) task_unlock(tsk); return buf; } +EXPORT_SYMBOL_GPL(get_task_comm); void set_task_comm(struct task_struct *tsk, char *buf) { @@ -1379,10 +1426,10 @@ EXPORT_SYMBOL(search_binary_handler); /* * sys_execve() executes a new program. */ -int do_execve(const char * filename, - const char __user *const __user *argv, - const char __user *const __user *envp, - struct pt_regs * regs) +static int do_execve_common(const char *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + struct pt_regs *regs) { struct linux_binprm *bprm; struct file *file; @@ -1489,6 +1536,34 @@ out_ret: return retval; } +int do_execve(const char *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execve_common(filename, argv, envp, regs); +} + +#ifdef CONFIG_COMPAT +int compat_do_execve(char *filename, + compat_uptr_t __user *__argv, + compat_uptr_t __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execve_common(filename, argv, envp, regs); +} +#endif + void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; @@ -1548,6 +1623,41 @@ expand_fail: return ret; } +static int cn_print_exe_file(struct core_name *cn) +{ + struct file *exe_file; + char *pathbuf, *path, *p; + int ret; + + exe_file = get_mm_exe_file(current->mm); + if (!exe_file) + return cn_printf(cn, "(unknown)"); + + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + ret = -ENOMEM; + goto put_exe_file; + } + + path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + goto free_buf; + } + + for (p = path; *p; p++) + if (*p == '/') + *p = '!'; + + ret = cn_printf(cn, "%s", path); + +free_buf: + kfree(pathbuf); +put_exe_file: + fput(exe_file); + return ret; +} + /* format_corename will inspect the pattern parameter, and output a * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. @@ -1619,6 +1729,9 @@ static int format_corename(struct core_name *cn, long signr) case 'e': err = cn_printf(cn, "%s", current->comm); break; + case 'E': + err = cn_print_exe_file(cn); + break; /* core limit size */ case 'c': err = cn_printf(cn, "%lu", diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0a78dae7e2c..1dd62ed35b8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if (!sb_set_blocksize(sb, blocksize)) { - ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto failed_sbi; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 32f3b869585..34b6d9bfc48 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir, handle_t *handle; struct inode * inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir, dquot_initialize(dir); + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext3_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2211,21 +2237,45 @@ retry: if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT3_I(inode)->i_data)) { + if (l > EXT3_N_BLOCKS * 4) { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); /* - * page_symlink() calls into ext3_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext3_orphan_del(handle, inode); if (err) { + ext3_journal_stop(handle); drop_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { inode->i_op = &ext3_fast_symlink_inode_operations; @@ -2239,6 +2289,10 @@ out_stop: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext3_link (struct dentry * old_dentry, diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 3c6a9e0eadc..aad153ef6b7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -36,6 +36,7 @@ #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/log2.h> +#include <linux/cleancache.h> #include <asm/uaccess.h> @@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, } else { ext3_msg(sb, KERN_INFO, "using internal journal"); } + cleancache_init_fs(sb); return res; } diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index c947e36eda6..04109460ba9 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1c67139ad4b..264f6949511 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to add to the block group - * @count: number of blocks to free - * - * This marks the blocks as free in the bitmap. We ask the - * mballoc to reload the buddy after this by setting group - * EXT4_GROUP_INFO_NEED_INIT_BIT flag - */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; - - ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - goto error_return; - } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - goto error_return; - } - - /* - * We are about to add blocks to the bitmap, - * so we need undo access. - */ - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext4_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - /* - * make sure we don't allow a parallel init on other groups in the - * same buddy cache - */ - down_write(&grp->alloc_sem); - for (i = 0, blocks_freed = 0; i < count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), - bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - blocks_freed++; - } - } - ext4_lock_group(sb, block_group); - blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); - ext4_free_blks_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(blocks_freed, - &sbi->s_flex_groups[flex_group].free_blocks); - } - /* - * request to reload the buddy with the - * new bitmap information - */ - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - grp->bb_free += blocks_freed; - up_write(&grp->alloc_sem); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - -/** * ext4_has_free_blocks() * @sbi: in-core super block structure. * @nblocks: number of needed blocks @@ -493,7 +369,8 @@ error_return: * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; @@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) EXT4_FREEBLOCKS_WATERMARK) { free_blocks = percpu_counter_sum_positive(fbc); dirty_blocks = percpu_counter_sum_positive(dbc); - if (dirty_blocks < 0) { - printk(KERN_CRIT "Dirty block accounting " - "went wrong %lld\n", - (long long)dirty_blocks); - } } /* Check whether we have space after * accounting for current dirty blocks & root reserved blocks. @@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) /* Hm, nope. Are (enough) root reserved blocks available? */ if (sbi->s_resuid == current_fsuid() || ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE)) { + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_blocks >= (nblocks + dirty_blocks)) return 1; } @@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) } int ext4_claim_free_blocks(struct ext4_sb_info *sbi, - s64 nblocks) + s64 nblocks, unsigned int flags) { - if (ext4_has_free_blocks(sbi, nblocks)) { + if (ext4_has_free_blocks(sbi, nblocks, flags)) { percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); return 0; } else @@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || + if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) || (*retries)++ > 3 || !EXT4_SB(sb)->s_journal) return 0; @@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) + ext4_fsblk_t goal, unsigned int flags, + unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; @@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; + ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4daaf2b753f..a74b89c09f9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 - +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -209,6 +210,8 @@ struct ext4_io_submit { */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ @@ -512,6 +515,10 @@ struct ext4_new_group_data { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Punch out blocks of an extent */ +#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* * Flags used by ext4_free_blocks @@ -1028,7 +1035,7 @@ struct ext4_super_block { __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ @@ -1144,6 +1151,9 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif + /* ext4 extent cache stats */ + unsigned long extent_cache_hits; + unsigned long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; @@ -1201,6 +1211,9 @@ struct ext4_sb_info { struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, */ struct ext4_lazy_init { unsigned long li_state; - - wait_queue_head_t li_wait_daemon; - wait_queue_head_t li_wait_task; - struct timer_list li_timer; - struct task_struct *li_task; - struct list_head li_request_list; struct mutex li_list_mtx; }; @@ -1615,6 +1639,67 @@ struct ext4_features { }; /* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* * Function prototypes */ @@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); -extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count); + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks, unsigned int flags); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ @@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int, __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) extern void __ext4_grp_locked_error(const char *, unsigned int, \ struct super_block *, ext4_group_t, \ unsigned long, ext4_fsblk_t, \ @@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); +extern int ext4_ext_punch_hole(struct file *file, loff_t offset, + loff_t length); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc); +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6e272ef6ba9..f5240aa1560 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,20 +6,6 @@ #include <trace/events/ext4.h> -int __ext4_journal_get_undo_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, bh, - handle, err); - } - return err; -} - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d0f53538a57..bb85757689b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); -int __ext4_journal_get_undo_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); @@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, int __ext4_handle_dirty_super(const char *where, unsigned int line, handle_t *handle, struct super_block *sb); -#define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4890d6f3ad1..5199bac7fc6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -46,6 +46,13 @@ #include <trace/events/ext4.h> +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err) + struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); return newblock; } @@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) } ext_debug("\n"); } + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; - struct ext4_extent *ex; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; @@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err); + newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; - ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } /* start copy from next extent */ - /* TODO: we could do it by single memmove */ - m = 0; - path[depth].p_ext++; - while (path[depth].p_ext <= - EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", - le32_to_cpu(path[depth].p_ext->ee_block), - ext4_ext_pblock(path[depth].p_ext), - ext4_ext_is_uninitialized(path[depth].p_ext), - ext4_ext_get_actual_len(path[depth].p_ext), - newblock); - /*memmove(ex++, path[depth].p_ext++, - sizeof(struct ext4_extent)); - neh->eh_entries++;*/ - path[depth].p_ext++; - m++; - } + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); if (m) { - memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); - /* copy indexes */ - m = 0; - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, - EXT_MAX_INDEX(path[i].p_hdr)); + /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, @@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", i, - le32_to_cpu(path[i].p_idx->ei_block), - ext4_idx_pblock(path[i].p_idx), - newblock); - /*memmove(++fidx, path[i].p_idx++, - sizeof(struct ext4_extent_idx)); - neh->eh_entries++; - BUG_ON(neh->eh_entries > neh->eh_max);*/ - path[i].p_idx++; - m++; - } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); if (m) { - memmove(++fidx, path[i].p_idx - m, + memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } @@ -1056,8 +1073,9 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; @@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); if (newblock == 0) return err; @@ -1140,8 +1159,9 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1161,7 +1181,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, flags, path, newext, i); if (err) goto out; @@ -1174,7 +1194,8 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, flags, + path, newext); if (err) goto out; @@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -static int ext4_ext_try_to_merge(struct inode *inode, +static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { @@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode, } /* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + int ret = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + ret = ext4_ext_try_to_merge_right(inode, path, ex); + + return ret; +} + +/* * check if a portion of the "newext" extent overlaps with an * existing extent. * @@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; + int flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1742,7 +1789,9 @@ repeat: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) + flags = EXT4_MB_USE_ROOT_BLOCKS; + err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * cache extent pointer. If the cached extent is a hole, + * this routine should be used instead of + * ext4_ext_in_cache if the calling function needs to + * know the size of the hole. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * * Return 0 if cache is invalid; 1 if the cache is valid */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ +static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_cache *ex){ struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; int ret = 0; /* @@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) goto errout; if (in_range(block, cex->ec_block, cex->ec_len)) { - ex->ee_block = cpu_to_le32(cex->ec_block); - ext4_ext_store_pblock(ex, cex->ec_start); - ex->ee_len = cpu_to_le16(cex->ec_len); + memcpy(ex, cex, sizeof(struct ext4_ext_cache)); ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); ret = 1; } errout: + if (!ret) + sbi->extent_cache_misses++; + else + sbi->extent_cache_hits++; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; } /* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * extent pointer. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache cex; + int ret = 0; + + if (ext4_ext_check_cache(inode, block, &cex)) { + ex->ee_block = cpu_to_le32(cex.ec_block); + ext4_ext_store_pblock(ex, cex.ec_start); + ex->ee_len = cpu_to_le16(cex.ec_len); + ret = 1; + } + + return ret; +} + + +/* * ext4_ext_rm_idx: * removes index from the index block. * It's used in truncate case only, thus all requests are for @@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); + /* head removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = to - from; + start = ext4_ext_pblock(ex); + + ext_debug("free first %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, 0, start, num, flags); + } else { printk(KERN_INFO "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, return 0; } + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @start: The first block to remove + * @end: The last block to remove + */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start) + struct ext4_ext_path *path, ext4_lblk_t start, + ext4_lblk_t end) { int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; @@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; + struct ext4_map_blocks map; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf\n", start); @@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? - ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; ext_debug(" border %u:%u\n", a, b); - if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { - block = 0; - num = 0; - BUG(); + /* If this extent is beyond the end of the hole, skip it */ + if (end <= ex_ee_block) { + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (a != ex_ee_block && + b != ex_ee_block + ex_ee_len - 1) { + /* + * If this is a truncate, then this condition should + * never happen because at least one of the end points + * needs to be on the edge of the extent. + */ + if (end == EXT_MAX_BLOCK) { + ext_debug(" bad truncate %u:%u\n", + start, end); + block = 0; + num = 0; + err = -EIO; + goto out; + } + /* + * else this is a hole punch, so the extent needs to + * be split since neither edge of the hole is on the + * extent edge + */ + else{ + map.m_pblk = ext4_ext_pblock(ex); + map.m_lblk = ex_ee_block; + map.m_len = b - ex_ee_block; + + err = ext4_split_extent(handle, + inode, path, &map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (err < 0) + goto out; + + ex_ee_len = ext4_ext_get_actual_len(ex); + + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + /* Then remove tail of this extent */ + block = ex_ee_block; + num = a - block; + } } else if (a != ex_ee_block) { /* remove tail of the extent */ block = ex_ee_block; num = a - block; } else if (b != ex_ee_block + ex_ee_len - 1) { /* remove head of the extent */ - block = a; - num = b - a; - /* there is no "make a hole" API yet */ - BUG(); + block = b; + num = ex_ee_block + ex_ee_len - b; + + /* + * If this is a truncate, this condition + * should never happen + */ + if (end == EXT_MAX_BLOCK) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } } else { /* remove whole extent: excellent! */ block = ex_ee_block; num = 0; - BUG_ON(a != ex_ee_block); - BUG_ON(b != ex_ee_block + ex_ee_len - 1); + if (a != ex_ee_block) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } + + if (b != ex_ee_block + ex_ee_len - 1) { + ext_debug(" bad truncate %u:%u\n", + start, end); + err = -EIO; + goto out; + } } /* @@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - le16_add_cpu(&eh->eh_entries, -1); + } else if (block != ex_ee_block) { + /* + * If this was a head removal, then we need to update + * the physical block since it is now at a different + * location + */ + ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); } ex->ee_block = cpu_to_le32(block); @@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCK) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } + ext_debug("new extent: %u:%u:%llu\n", block, num, ext4_ext_pblock(ex)); ex--; @@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2365,7 +2574,8 @@ again: while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, start); + err = ext4_ext_rm_leaf(handle, inode, path, + start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) return ret; } +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or uninit) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. + */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. + */ + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNINIT1) + ext4_ext_mark_uninitialized(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le32(ee_len); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (upto three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int uninitialized; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } + + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) + split_flag1 |= EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? err : map->m_len; +} + #define EXT4_EXT_ZERO_LEN 7 /* * This function is called by ext4_ext_map_blocks() if someone tries to write @@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_map_blocks *map, struct ext4_ext_path *path) { - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex1 = NULL; - struct ext4_extent *ex2 = NULL; - struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; - ext4_fsblk_t newblock; int err = 0; - int ret = 0; - int may_zeroout; + int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - - ex2 = ex; - orig_ex.ee_block = ex->ee_block; - orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); + WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - may_zeroout = ee_block + ee_len <= eof_block; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, ex); if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - return allocated; - } - - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ - if (map->m_lblk > ee_block) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * for sanity, update the length of the ex2 extent before - * we insert ex3, if ex1 is NULL. This is to avoid temporary - * overlap of blocks. - */ - if (!ex1 && allocated > map->m_len) - ex2->ee_len = cpu_to_le16(map->m_len); - /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > map->m_len) { - unsigned int newdepth; - /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ - if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { - /* - * map->m_lblk == ee_block is handled by the zerouout - * at the beginning. - * Mark first half uninitialized. - * Mark second half initialized and zero out the - * initialized extent - */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = cpu_to_le16(ee_len - allocated); - ext4_ext_mark_uninitialized(ex); - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex3, newblock); - ex3->ee_len = cpu_to_le16(allocated); - err = ext4_ext_insert_extent(handle, inode, path, - ex3, 0); - if (err == -ENOSPC) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, - ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - - /* - * We need to zero out the second half because - * an fallocate request can update file size and - * converting the second half to initialized extent - * implies that we can leak some junk data to user - * space. - */ - err = ext4_ext_zeroout(inode, ex3); - if (err) { - /* - * We should actually mark the - * second half as uninit and return error - * Insert would have changed the extent - */ - depth = ext_depth(inode); - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, - path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - return err; - } - /* get the second half extent details */ - ex = path[depth].p_ext; - err = ext4_ext_get_access(handle, inode, - path + depth); - if (err) - return err; - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; - } - - /* zeroed the second half */ - return allocated; - } - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); - ext4_ext_store_pblock(ex3, newblock + map->m_len); - ex3->ee_len = cpu_to_le16(allocated - map->m_len); - ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - /* - * The depth, and hence eh & ex might change - * as part of the insert above. - */ - newdepth = ext_depth(inode); - /* - * update the extent length after successful insert of the - * split extent - */ - ee_len -= ext4_ext_get_actual_len(ex3); - orig_ex.ee_len = cpu_to_le16(ee_len); - may_zeroout = ee_block + ee_len <= eof_block; - - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - - allocated = map->m_len; - - /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying - * to insert a extent in the middle zerout directly - * otherwise give the extent a chance to merge to left - */ - if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && - map->m_lblk != ee_block && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - /* blocks available from map->m_lblk */ - return allocated; - } - } - /* - * If there was a change of depth as part of the - * insertion of ex3 above, we need to update the length - * of the ex1 extent again here - */ - if (ex1 && ex1 != ex) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ - ex2->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex2, newblock); - ex2->ee_len = cpu_to_le16(allocated); - if (ex2 != ex) - goto insert; - /* - * New (initialized) extent starts from the first block - * in the current extent. i.e., ex2 == ex - * We have to see if it can be merged with the extent - * on the left. - */ - if (ex2 > EXT_FIRST_EXTENT(eh)) { - /* - * To merge left, pass "ex2 - 1" to try_to_merge(), - * since it merges towards right _only_. - */ - ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - ex2--; - } + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; } + /* - * Try to Merge towards right. This might be required - * only when the whole extent is being written to. - * i.e. ex2 == ex and ex3 == NULL. + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. */ - if (!ex3) { - ret = ext4_ext_try_to_merge(inode, path, ex2); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (allocated > map->m_len) { + if (allocated <= EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); if (err) goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if ((map->m_lblk - ee_block + map->m_len < + EXT4_EXT_ZERO_LEN) && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; } } - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; -insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - return allocated; - } else if (err) - goto fix_extent_len; + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, 0); + if (allocated < 0) + err = allocated; + out: - ext4_ext_show_leaf(inode, path); return err ? err : allocated; - -fix_extent_len: - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; } /* @@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_ext_path *path, int flags) { - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex1 = NULL; - struct ext4_extent *ex2 = NULL; - struct ext4_extent *ex3 = NULL; - ext4_lblk_t ee_block, eof_block; - unsigned int allocated, ee_len, depth; - ext4_fsblk_t newblock; - int err = 0; - int may_zeroout; + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; ext_debug("ext4_split_unwritten_extents: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle, inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); - - ex2 = ex; - orig_ex.ee_block = ex->ee_block; - orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); - /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. */ - may_zeroout = ee_block + ee_len <= eof_block; - - /* - * If the uninitialized extent begins at the same logical - * block where the write begins, and the write completely - * covers the extent, then we don't need to split it. - */ - if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) - return allocated; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ - if (map->m_lblk > ee_block) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * for sanity, update the length of the ex2 extent before - * we insert ex3, if ex1 is NULL. This is to avoid temporary - * overlap of blocks. - */ - if (!ex1 && allocated > map->m_len) - ex2->ee_len = cpu_to_le16(map->m_len); - /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > map->m_len) { - unsigned int newdepth; - ex3 = &newex; - ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); - ext4_ext_store_pblock(ex3, newblock + map->m_len); - ex3->ee_len = cpu_to_le16(allocated - map->m_len); - ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ - /* blocks available from map->m_lblk */ - return allocated; - - } else if (err) - goto fix_extent_len; - /* - * The depth, and hence eh & ex might change - * as part of the insert above. - */ - newdepth = ext_depth(inode); - /* - * update the extent length after successful insert of the - * split extent - */ - ee_len -= ext4_ext_get_actual_len(ex3); - orig_ex.ee_len = cpu_to_le16(ee_len); - may_zeroout = ee_block + ee_len <= eof_block; - - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; - allocated = map->m_len; - } - /* - * If there was a change of depth as part of the - * insertion of ex3 above, we need to update the length - * of the ex1 extent again here - */ - if (ex1 && ex1 != ex) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* - * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written - * using direct I/O, uninitialised still. - */ - ex2->ee_block = cpu_to_le32(map->m_lblk); - ext4_ext_store_pblock(ex2, newblock); - ex2->ee_len = cpu_to_le16(allocated); - ext4_ext_mark_uninitialized(ex2); - if (ex2 != ex) - goto insert; - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); - ext_debug("out here\n"); - goto out; -insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC && may_zeroout) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_dirty(handle, inode, path + depth); - /* zero out the first half */ - return allocated; - } else if (err) - goto fix_extent_len; -out: - ext4_ext_show_leaf(inode, path); - return err ? err : allocated; - -fix_extent_len: - ex->ee_block = orig_ex.ee_block; - ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); - ext4_ext_mark_uninitialized(ex); - ext4_ext_dirty(handle, inode, path + depth); - return err; + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); } + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_extent_header *eh; int depth; int err = 0; - int ret = 0; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); - /* - * We have to see if it can be merged with the extent - * on the left. - */ - if (ex > EXT_FIRST_EXTENT(eh)) { - /* - * To merge left, pass "ex - 1" to try_to_merge(), - * since it merges towards right _only_. - */ - ret = ext4_ext_try_to_merge(inode, path, ex - 1); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - ex--; - } - } - /* - * Try to Merge towards right. + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed */ - ret = ext4_ext_try_to_merge(inode, path, ex); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto out; - depth = ext_depth(inode); - } + ext4_ext_try_to_merge(inode, path, ex); + /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); out: @@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; + unsigned int punched_out = 0; + unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + struct ext4_map_blocks punch_map; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && + ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - /* Do not put uninitialized extent in the cache */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; + if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { + /* + * Do not put uninitialized extent + * in the cache + */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; + } + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; } - ret = ext4_ext_handle_uninitialized_extents(handle, - inode, map, path, flags, allocated, - newblock); - return ret; + + /* + * Punch out the map length, but only to the + * end of the extent + */ + punched_out = allocated < map->m_len ? + allocated : map->m_len; + + /* + * Sense extents need to be converted to + * uninitialized, they must fit in an + * uninitialized extent + */ + if (punched_out > EXT_UNINIT_MAX_LEN) + punched_out = EXT_UNINIT_MAX_LEN; + + punch_map.m_lblk = map->m_lblk; + punch_map.m_pblk = newblock; + punch_map.m_len = punched_out; + punch_map.m_flags = 0; + + /* Check to see if the extent needs to be split */ + if (punch_map.m_len != ee_len || + punch_map.m_lblk != ee_block) { + + ret = ext4_split_extent(handle, inode, + path, &punch_map, 0, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT | + EXT4_GET_BLOCKS_PRE_IO); + + if (ret < 0) { + err = ret; + goto out2; + } + /* + * find extent for the block at + * the start of the hole + */ + ext4_ext_drop_refs(path); + kfree(path); + + path = ext4_ext_find_extent(inode, + map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + + } + + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_remove_space(inode, map->m_lblk, + map->m_lblk + punched_out); + + goto out2; } } @@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, else /* disable in-core preallocation for non-regular files */ ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -3529,7 +3647,11 @@ out2: } trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, newblock, map->m_len, err ? err : allocated); - return err ? err : allocated; + + result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? + punched_out : allocated; + + return err ? err : result; } void ext4_ext_truncate(struct inode *inode) @@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode) if (IS_SYNC(inode)) ext4_handle_sync(handle); -out_stop: up_write(&EXT4_I(inode)->i_data_sem); + +out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(file, offset, len); + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -3691,7 +3817,8 @@ retry: break; } ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_NO_NORMALIZE); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, pgoff_t last_offset; pgoff_t offset; pgoff_t index; + pgoff_t start_index = 0; struct page **pages = NULL; struct buffer_head *bh = NULL; struct buffer_head *head = NULL; @@ -3848,39 +3976,57 @@ out: kfree(pages); return EXT_CONTINUE; } + index = 0; +next_page: /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[0]->index << PAGE_SHIFT) >> + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> blksize_bits; - if (!page_has_buffers(pages[0])) + if (!page_has_buffers(pages[index])) goto out; - head = page_buffers(pages[0]); + head = page_buffers(pages[index]); if (!head) goto out; + index++; bh = head; do { - if (buffer_mapped(bh)) { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; /* get the 1st mapped buffer. */ - if (end > newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; goto found_mapped_buffer; } + bh = bh->b_this_page; end++; } while (bh != head); - /* No mapped buffer found. */ - goto out; + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. + */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; } else { /*Find contiguous delayed buffers. */ if (ret > 0 && pages[0]->index == last_offset) head = page_buffers(pages[0]); bh = head; + index = 1; + start_index = 0; } found_mapped_buffer: @@ -3903,7 +4049,7 @@ found_mapped_buffer: end++; } while (bh != head); - for (index = 1; index < ret; index++) { + for (; index < ret; index++) { if (!page_has_buffers(pages[index])) { bh = NULL; break; @@ -3913,8 +4059,10 @@ found_mapped_buffer: bh = NULL; break; } + if (pages[index]->index != - pages[0]->index + index) { + pages[start_index]->index + index + - start_index) { /* Blocks are not contiguous. */ bh = NULL; break; @@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } +/* + * ext4_ext_punch_hole + * + * Punches a hole of "length" bytes in a file starting + * at byte "offset" + * + * @inode: The inode of the file to punch a hole in + * @offset: The starting byte offset of the hole + * @length: The length of the hole + * + * Returns the number of blocks removed or negative on err + */ +int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_ext_cache cache_ex; + ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + struct address_space *mapping = inode->i_mapping; + struct ext4_map_blocks map; + handle_t *handle; + loff_t first_block_offset, last_block_offset, block_len; + loff_t first_page, last_page, first_page_offset, last_page_offset; + int ret, credits, blocks_released, err = 0; + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); + last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + first_page_offset == 0 ? 0 : first_page_offset-1, + last_page_offset); + + if (err) + return err; + } + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(mapping, first_page_offset, + last_page_offset-1); + } + + /* finish any pending end_io work */ + ext4_flush_completed_IO(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err) + goto out; + + /* + * Now we need to zero out the un block aligned data. + * If the file is smaller than a block, just + * zero out the middle + */ + if (first_block > last_block) + ext4_block_zero_page_range(handle, mapping, offset, length); + else { + /* zero out the head of the hole before the first block */ + block_len = first_block_offset - offset; + if (block_len > 0) + ext4_block_zero_page_range(handle, mapping, + offset, block_len); + + /* zero out the tail of the hole after the last block */ + block_len = offset + length - last_block_offset; + if (block_len > 0) { + ext4_block_zero_page_range(handle, mapping, + last_block_offset, block_len); + } + } + + /* If there are no blocks to remove, return now */ + if (first_block >= last_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + /* + * Loop over all the blocks and identify blocks + * that need to be punched out + */ + iblock = first_block; + blocks_released = 0; + while (iblock < last_block) { + max_blocks = last_block - iblock; + num_blocks = 1; + memset(&map, 0, sizeof(map)); + map.m_lblk = iblock; + map.m_len = max_blocks; + ret = ext4_ext_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (ret > 0) { + blocks_released += ret; + num_blocks = ret; + } else if (ret == 0) { + /* + * If map blocks could not find the block, + * then it is in a hole. If the hole was + * not already cached, then map blocks should + * put it in the cache. So we can get the hole + * out of the cache + */ + memset(&cache_ex, 0, sizeof(cache_ex)); + if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && + !cache_ex.ec_start) { + + /* The hole is cached */ + num_blocks = cache_ex.ec_block + + cache_ex.ec_len - iblock; + + } else { + /* The block could not be identified */ + err = -EIO; + break; + } + } else { + /* Map blocks error */ + err = ret; + break; + } + + if (num_blocks == 0) { + /* This condition should never happen */ + ext_debug("Block lookup failed"); + err = -EIO; + break; + } + + iblock += num_blocks; + } + + if (blocks_released > 0) { + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return err; +} int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { @@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } - diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7b80d543b89..2c097232200 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = { }; const struct inode_operations ext4_file_inode_operations = { - .truncate = ext4_truncate, .setattr = ext4_setattr, .getattr = ext4_getattr, #ifdef CONFIG_EXT4_FS_XATTR diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e9473cbe80d..ce66d2fe826 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -36,7 +36,7 @@ static void dump_completed_IO(struct inode * inode) { -#ifdef EXT4_DEBUG +#ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; unsigned long flags; @@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; tid_t commit_tid; + bool needs_barrier = false; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync) } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (jbd2_log_start_commit(journal, commit_tid)) { - /* - * When the journal is on a different device than the - * fs data disk, we need to issue the barrier in - * writeback mode. (In ordered mode, the jbd2 layer - * will take care of issuing the barrier. In - * data=journal, all of the data blocks are written to - * the journal device.) - */ - if (ext4_should_writeback_data(inode) && - (journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, - NULL); - ret = jbd2_log_wait_commit(journal, commit_tid); - } else if (journal->j_flags & JBD2_BARRIER) + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); out: trace_ext4_sync_file_exit(inode, ret); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f2fa5e8a582..50d0e9c6458 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, - goal, &count, err); + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); if (*err) goto failed_out; @@ -1930,7 +1930,7 @@ repeat: * We do still charge estimated metadata to the sb though; * we cannot afford to run out of free blocks. */ - if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); @@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping, continue; } - if (PageWriteback(page)) - wait_on_page_writeback(page); - + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); if (mpd->next_page != page->index) @@ -3513,7 +3511,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext4_truncate_failed_write(inode); } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode) int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, length, pos; + unsigned blocksize, max, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle, return -EINVAL; blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); + max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) @@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, int ext4_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode) } /* + * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * associated with the given offset and length + * + * @inode: File inode + * @offset: The offset where the hole will begin + * @len: The length of the hole + * + * Returns: 0 on sucess or negative on failure + */ + +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + if (!S_ISREG(inode->i_mode)) + return -ENOTSUPP; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + /* TODO: Add support for non extent hole punching */ + return -ENOTSUPP; + } + + return ext4_ext_punch_hole(file, offset, length); +} + +/* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire @@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode, /* * Figure out the offset within the block group inode table */ - inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb)); block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); @@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size || - (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { + (attr->ia_size < inode->i_size)) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } - /* ext4_truncate will clear the flag */ - if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) - ext4_truncate(inode); } - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) - rc = vmtruncate(inode, attr->ia_size); + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + ext4_truncate(inode); + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + ext4_truncate(inode); + } if (!rc) { setattr_copy(inode, attr); @@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; } ret = 0; - if (PageMappedToDisk(page)) - goto out_unlock; + + lock_page(page); + wait_on_page_writeback(page); + if (PageMappedToDisk(page)) { + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; + } if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - lock_page(page); /* * return if we have all the buffers mapped. This avoid * the need to call write_begin/write_end which does a @@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { - unlock_page(page); - goto out_unlock; + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; } } unlock_page(page); @@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret < 0) goto out_unlock; ret = 0; + + /* + * write_begin/end might have created a dirty page and someone + * could wander in and start the IO. Make sure that hasn't + * happened. + */ + lock_page(page); + wait_on_page_writeback(page); + up_read(&inode->i_alloc_sem); + return VM_FAULT_LOCKED; out_unlock: if (ret) ret = VM_FAULT_SIGBUS; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d8a16eecf1d..859f2ae8864 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct inode *inode; char *data; char *bitmap; + struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); @@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (first_group + i >= ngroups) break; + grinfo = ext4_get_group_info(sb, first_group + i); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + err = -EIO; desc = ext4_get_group_desc(sb, first_group + i, NULL); if (desc == NULL) @@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } /* wait for I/O completion */ - for (i = 0; i < groups_per_page && bh[i]; i++) - wait_on_buffer(bh[i]); + for (i = 0; i < groups_per_page; i++) + if (bh[i]) + wait_on_buffer(bh[i]); err = -EIO; - for (i = 0; i < groups_per_page && bh[i]; i++) - if (!buffer_uptodate(bh[i])) + for (i = 0; i < groups_per_page; i++) + if (bh[i] && !buffer_uptodate(bh[i])) goto out; err = 0; first_block = page->index * blocks_per_page; - /* init the page */ - memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; - struct ext4_group_info *grinfo; group = (first_block + i) >> 1; if (group >= ngroups) break; + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + /* * data carry information regarding this * particular group in the format specified @@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; @@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page && bh[i]; i++) + for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -957,22 +974,21 @@ out: } /* - * lock the group_info alloc_sem of all the groups - * belonging to the same buddy cache page. This - * make sure other parallel operation on the buddy - * cache doesn't happen whild holding the buddy cache - * lock + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ -static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, - ext4_group_t group) +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) { - int i; - int block, pnum; + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; int blocks_per_page; - int groups_per_page; - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t first_group; - struct ext4_group_info *grp; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; /* @@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, */ block = group * 2; pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - if ((first_group + i) >= ngroups) - break; - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - down_write_nested(&grp->alloc_sem, i); + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; } - return i; + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; } -static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, - ext4_group_t group, int locked_group) +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - int i; - int block, pnum; - int blocks_per_page; - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - /* release locks on all the groups */ - for (i = 0; i < locked_group; i++) { - - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - up_write(&grp->alloc_sem); + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); } - } /* @@ -1044,93 +1043,60 @@ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { - int ret = 0; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; mb_debug(1, "init group %u\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have taken the alloc_sem lock. + * would have pinned buddy page to page cache. */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ - ret = 0; goto err; } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { + if (e4b.bd_buddy_page == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); + ret = 0; + goto err; } - if (page == NULL || !PageUptodate(page)) { + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } mark_page_accessed(page); err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); + ext4_mb_put_buddy_page_lock(&e4b); return ret; } @@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; - e4b->alloc_semp = &grp->alloc_sem; - - /* Take the read lock on the group alloc - * sem. This would make sure a parallel - * ext4_mb_init_group happening on other - * groups mapped by the page is blocked - * till we are done with allocation - */ -repeat_load_buddy: - down_read(e4b->alloc_semp); if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - /* we need to check for group need init flag - * with alloc_semp held so that we can be sure - * that new blocks didn't get added to the group - * when we are loading the buddy cache - */ - up_read(e4b->alloc_semp); /* * we need full data about the group * to make a good selection @@ -1189,7 +1139,6 @@ repeat_load_buddy: ret = ext4_mb_init_group(sb, group); if (ret) return ret; - goto repeat_load_buddy; } /* @@ -1273,15 +1222,14 @@ repeat_load_buddy: return 0; err: + if (page) + page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - - /* Done with the buddy cache */ - up_read(e4b->alloc_semp); return ret; } @@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); - /* Done with the buddy cache */ - if (e4b->alloc_semp) - up_read(e4b->alloc_semp); } @@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - /* on allocation we use ac to track the held semaphore */ - ac->alloc_semp = e4b->alloc_semp; - e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, ret, count = 0, count2 = 0; + int err, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) { - ret = ext4_issue_discard(sb, entry->group, - entry->start_blk, entry->count); - if (unlikely(ret == -EOPNOTSUPP)) { - ext4_warning(sb, "discard not supported, " - "disabling"); - clear_opt(sb, DISCARD); - } - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); } } - if (ac->alloc_semp) - up_read(ac->alloc_semp); if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding - * doesn't grow big. We need to release - * alloc_semp before calling ext4_mb_add_n_trim() + * doesn't grow big. */ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); @@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ - while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + while (ar->len && + ext4_claim_free_blocks(sbi, ar->len, ar->flags)) { + /* let others to free the space */ yield(); ar->len = ar->len >> 1; @@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, ar->len); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, ar->len)) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } } inquota = ar->len; if (ar->len == 0) { @@ -4704,6 +4645,127 @@ error_return: } /** + * ext4_add_groupblocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + goto error_return; + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group @@ -4715,11 +4777,10 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static void ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; - int ret = 0; assert_spin_locked(ext4_group_lock_ptr(sb, group)); @@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - - ret = ext4_issue_discard(sb, group, start, count); - + ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); - return ret; } /** @@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, * the group buddy bitmap. This is done until whole group is scanned. */ static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, - ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) { void *bitmap; ext4_grpblk_t next, count = 0; - ext4_group_t group; - int ret = 0; + struct ext4_buddy e4b; + int ret; - BUG_ON(e4b == NULL); + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; - bitmap = e4b->bd_bitmap; - group = e4b->bd_group; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; ext4_lock_group(sb, group); + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; while (start < max) { start = mb_find_next_zero_bit(bitmap, max, start); @@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, next = mb_find_next_bit(bitmap, max, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, e4b); - if (ret < 0) - break; + ext4_trim_extent(sb, start, + next - start, group, &e4b); count += next - start; } start = next + 1; @@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ext4_lock_group(sb, group); } - if ((e4b->bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - count) < minblocks) break; } ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", count, group); - if (ret < 0) - count = ret; - return count; } @@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct ext4_buddy e4b; + struct ext4_group_info *grp; ext4_group_t first_group, last_group; ext4_group_t group, ngroups = ext4_get_groups_count(sb); ext4_grpblk_t cnt = 0, first_block, last_block; - uint64_t start, len, minlen, trimmed; + uint64_t start, len, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); int ret = 0; @@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> sb->s_blocksize_bits; len = range->len >> sb->s_blocksize_bits; minlen = range->minlen >> sb->s_blocksize_bits; - trimmed = 0; if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; @@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) return -EINVAL; for (group = first_group; group <= last_group; group++) { - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); - break; + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; } /* @@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) last_block = first_block + len; len -= last_block - first_block; - if (e4b.bd_info->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, &e4b, first_block, + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_block, last_block, minlen); if (cnt < 0) { ret = cnt; - ext4_mb_unload_buddy(&e4b); break; } } - ext4_mb_unload_buddy(&e4b); trimmed += cnt; first_block = 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 22bd4d7f289..20b5e7bfebd 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -193,11 +193,6 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; - /* - * pointer to the held semaphore upon successful - * block allocation - */ - struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -215,7 +210,6 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; - struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 92816b4e0f1..b57b98fb44d 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 00000000000..9bdef3f537c --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,351 @@ +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/buffer_head.h> +#include <linux/utsname.h> +#include <linux/kthread.h> + +#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->sysname, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval && (failed_writes % 60) == 0) { + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b9f3e7862f1..2b8304bf3c5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * It needs to call wait_on_page_writeback() to wait for the * writeback of the page. */ - if (PageWriteback(page)) - wait_on_page_writeback(page); + wait_on_page_writeback(page); /* Release old bh and drop refs */ try_to_release_page(page, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 67fd0b02585..b754b7721f5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); @@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir, handle_t *handle; struct inode *inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir, dquot_initialize(dir); + if (l > EXT4_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext4_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2263,21 +2292,44 @@ retry: if (IS_ERR(inode)) goto out_stop; - if (l > sizeof(EXT4_I(inode)->i_data)) { + if (l > EXT4_N_BLOCKS * 4) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* - * page_symlink() calls into ext4_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext4_journal_start(dir, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext4_orphan_del(handle, inode); if (err) { + ext4_journal_stop(handle); clear_nlink(inode); - unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); - iput(inode); - goto out_stop; + goto err_drop_inode; } } else { /* clear the extent format for fast symlink */ @@ -2293,6 +2345,10 @@ out_stop: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext4_link(struct dentry *old_dentry, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b6dbd056fcb..7bb8f76d470 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error) for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; - int partial_write = 0; + loff_t offset; + loff_t io_end_offset; - head = page_buffers(page); - if (error) + if (error) { SetPageError(page); - BUG_ON(!head); - if (head->b_size != PAGE_CACHE_SIZE) { - loff_t offset; - loff_t io_end_offset = io_end->offset + io_end->size; + set_bit(AS_EIO, &page->mapping->flags); + head = page_buffers(page); + BUG_ON(!head); + + io_end_offset = io_end->offset + io_end->size; offset = (sector_t) page->index << PAGE_CACHE_SHIFT; bh = head; do { if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) { - if (error) - buffer_io_error(bh); - - } - if (buffer_delay(bh)) - partial_write = 1; - else if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - else if (buffer_dirty(bh)) - partial_write = 1; + (offset+bh->b_size <= io_end_offset)) + buffer_io_error(bh); + offset += bh->b_size; bh = bh->b_this_page; } while (bh != head); } - /* - * If this is a partial write which happened to make - * all buffers uptodate then we can optimize away a - * bogus readpage() for the next read(). Here we - * 'discover' whether the page went uptodate as a - * result of this (potentially partial) write. - */ - if (!partial_write) - SetPageUptodate(page); - put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8553dfb310a..cc5c157aa11 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -38,6 +38,7 @@ #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> +#include <linux/cleancache.h> #include <asm/uaccess.h> #include <linux/kthread.h> @@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, @@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (!test_opt(sb, INIT_INODE_TABLE)) seq_puts(seq, ",noinit_inode_table"); - else if (sbi->s_li_wait_mult) + else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) seq_printf(seq, ",init_inode_table=%u", (unsigned) sbi->s_li_wait_mult); @@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { -#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, -#endif .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, @@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); + cleancache_init_fs(sb); return res; } @@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } +static ssize_t extent_cache_hits_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); +} + +static ssize_t extent_cache_misses_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); +} + static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RO_ATTR(extent_cache_hits); +EXT4_RO_ATTR(extent_cache_misses); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(extent_cache_hits), + ATTR_LIST(extent_cache_misses), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } -static void ext4_lazyinode_timeout(unsigned long data) -{ - struct task_struct *p = (struct task_struct *)data; - wake_up_process(p); -} - /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { @@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = jiffies - timeout; - if (elr->lr_sbi->s_li_wait_mult) - timeout *= elr->lr_sbi->s_li_wait_mult; - else - timeout *= 20; + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; elr->lr_timeout = timeout; } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) /* * Remove lr_request from the list_request and free the - * request tructure. Should be called with li_list_mtx held + * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { @@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr) static void ext4_unregister_li_request(struct super_block *sb) { - struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; - - if (!ext4_li_info) + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); return; + } mutex_lock(&ext4_li_info->li_list_mtx); - ext4_remove_li_request(elr); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; @@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; struct list_head *pos, *n; struct ext4_li_request *elr; - unsigned long next_wakeup; - DEFINE_WAIT(wait); + unsigned long next_wakeup, cur; BUG_ON(NULL == eli); - eli->li_timer.data = (unsigned long)current; - eli->li_timer.function = ext4_lazyinode_timeout; - - eli->li_task = current; - wake_up(&eli->li_wait_task); - cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; @@ -2797,19 +2817,15 @@ cont_thread: if (freezing(current)) refrigerator(); - if ((time_after_eq(jiffies, next_wakeup)) || + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } - eli->li_timer.expires = next_wakeup; - add_timer(&eli->li_timer); - prepare_to_wait(&eli->li_wait_daemon, &wait, - TASK_INTERRUPTIBLE); - if (time_before(jiffies, next_wakeup)) - schedule(); - finish_wait(&eli->li_wait_daemon, &wait); + schedule_timeout_interruptible(next_wakeup - cur); + if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; @@ -2833,12 +2849,7 @@ exit_thread: goto cont_thread; } mutex_unlock(&eli->li_list_mtx); - del_timer_sync(&ext4_li_info->li_timer); - eli->li_task = NULL; - wake_up(&eli->li_wait_task); - kfree(ext4_li_info); - ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void) if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); - del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4: error %d creating inode table " @@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void) return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; - - wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); return 0; } @@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void) if (!eli) return -ENOMEM; - eli->li_task = NULL; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); - init_waitqueue_head(&eli->li_wait_daemon); - init_waitqueue_head(&eli->li_wait_task); - init_timer(&eli->li_timer); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; @@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb, ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; int ret = 0; - if (sbi->s_li_request != NULL) + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. + */ + sbi->s_li_request->lr_timeout = 0; return 0; + } if (first_not_zeroed == ngroups || (sb->s_flags & MS_RDONLY) || - !test_opt(sb, INIT_INODE_TABLE)) { - sbi->s_li_request = NULL; + !test_opt(sb, INIT_INODE_TABLE)) return 0; - } - - if (first_not_zeroed == ngroups) { - sbi->s_li_request = NULL; - return 0; - } elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) @@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { ext4_msg(sb, KERN_WARNING, @@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } else { clear_opt(sb, DATA_FLAGS); - set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3707,6 +3742,8 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - int err; + int err = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } enable_quota = 1; } } @@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; u64 fsid; + s64 bfree; if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; @@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = max_t(s64, bfree, 0); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; @@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type) if (test_opt(sb, DELALLOC)) sync_filesystem(sb); + if (!inode) + goto out; + /* Update modification times of quota files when userspace can * start looking at them */ handle = ext4_journal_start(inode, 1); @@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) @@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } #endif static struct file_system_type ext4_fs_type = { @@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void) err = init_inodecache(); if (err) goto out1; - register_as_ext2(); register_as_ext3(); + register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b545ca1c459..c757adc9725 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -820,8 +820,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, - goal, NULL, &error); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); if (error) goto cleanup; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index ae8200f84e3..1cc7038e273 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) spin_unlock(&MSDOS_I(inode)->cache_lru_lock); tmp = fat_cache_alloc(inode); + if (!tmp) { + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + MSDOS_I(inode)->nr_caches--; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + return; + } + spin_lock(&MSDOS_I(inode)->cache_lru_lock); cache = fat_cache_merge(inode, new); if (cache != NULL) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ee42b9e0b16..4ad64732cbc 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -98,7 +98,7 @@ next: *bh = sb_bread(sb, phys); if (*bh == NULL) { - printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n", + fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed", (llu)phys); /* skip this block */ *pos = (iblock + 1) << sb->s_blocksize_bits; @@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ -static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, - int uni_xlate, struct nls_table *nls) +static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, + const wchar_t *uni, int len, struct nls_table *nls) { + int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; const wchar_t *ip; wchar_t ec; unsigned char *op; @@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, } if (unlikely(*ip)) { - printk(KERN_WARNING "FAT: filename was truncated while " - "converting."); + fat_msg(sb, KERN_WARNING, "filename was truncated while " + "converting."); } *op = 0; return (op - ascii); } -static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, +static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, unsigned char *buf, int size) { + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->options.utf8) return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, UTF16_HOST_ENDIAN, buf, size); else - return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, - sbi->nls_io); + return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); } static inline int @@ -419,7 +420,7 @@ parse_record: /* Compare shortname */ bufuname[last_u] = 0x0000; - len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); if (fat_name_match(sbi, name, name_len, bufname, len)) goto found; @@ -428,7 +429,7 @@ parse_record: int size = PATH_MAX - FAT_MAX_UNI_SIZE; /* Compare longname */ - len = fat_uni_to_x8(sbi, unicode, longname, size); + len = fat_uni_to_x8(sb, unicode, longname, size); if (fat_name_match(sbi, name, name_len, longname, len)) goto found; } @@ -545,7 +546,7 @@ parse_record: if (nr_slots) { void *longname = unicode + FAT_MAX_UNI_CHARS; int size = PATH_MAX - FAT_MAX_UNI_SIZE; - int len = fat_uni_to_x8(sbi, unicode, longname, size); + int len = fat_uni_to_x8(sb, unicode, longname, size); fill_name = longname; fill_len = len; @@ -621,7 +622,7 @@ parse_record: if (isvfat) { bufuname[j] = 0x0000; - i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); } if (nr_slots) { /* hack for fat_ioctl_filldir() */ @@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) { + struct super_block *sb = dir->i_sb; struct msdos_dir_entry *de; struct buffer_head *bh; int err = 0, nr_slots; @@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) */ err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); if (err) { - printk(KERN_WARNING - "FAT: Couldn't remove the long name slots\n"); + fat_msg(sb, KERN_WARNING, + "Couldn't remove the long name slots"); } } @@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, if (sbi->fat_bits != 32) goto error; } else if (MSDOS_I(dir)->i_start == 0) { - printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n", + fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", MSDOS_I(dir)->i_pos); err = -EIO; goto error; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index f50408901f7..8276cc282de 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, - int isvfat, void (*setup)(struct super_block *)); + int isvfat, void (*setup)(struct super_block *)); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ extern void -__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) +__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(sb, fmt, args...) \ + __fat_fs_error(sb, 1, fmt , ## args) +#define fat_fs_error_ratelimit(sb, fmt, args...) \ + __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))) __cold; -#define fat_fs_error(s, fmt, args...) \ - __fat_fs_error(s, 1, fmt , ## args) -#define fat_fs_error_ratelimit(s, fmt, args...) \ - __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index b47d2c9f4fa..2e81ac0df7e 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr); + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } @@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8d68690bdcf..cb8d8391ac0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->free_clusters; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12; + buf->f_namelen = + (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; return 0; } @@ -619,8 +620,8 @@ retry: bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { - printk(KERN_ERR "FAT: unable to read inode block " - "for updating (i_pos %lld)\n", i_pos); + fat_msg(sb, KERN_ERR, "unable to read inode block " + "for updating (i_pos %lld)", i_pos); return -EIO; } spin_lock(&sbi->inode_hash_lock); @@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int is_vfat, int silent, int *debug, - struct fat_mount_options *opts) +static int parse_options(struct super_block *sb, char *options, int is_vfat, + int silent, int *debug, struct fat_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, /* obsolete mount options */ case Opt_obsolate: - printk(KERN_INFO "FAT: \"%s\" option is obsolete, " - "not supported now\n", p); + fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " + "not supported now", p); break; /* unknown option */ default: if (!silent) { - printk(KERN_ERR - "FAT: Unrecognized mount option \"%s\" " - "or missing value\n", p); + fat_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); } return -EINVAL; } @@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { - printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" + fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be " "case sensitive!\n"); } @@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode) /* * Read the super block of an MS-DOS FS. */ -int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat, +int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)) { struct inode *root_inode = NULL, *fat_inode = NULL; @@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; - sbi->dir_ops = fs_dir_inode_ops; ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - error = parse_options(data, isvfat, silent, &debug, &sbi->options); + error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); if (error) goto out_fail; @@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { - printk(KERN_ERR "FAT: unable to read boot sector\n"); + fat_msg(sb, KERN_ERR, "unable to read boot sector"); goto out_fail; } b = (struct fat_boot_sector *) bh->b_data; if (!b->reserved) { if (!silent) - printk(KERN_ERR "FAT: bogus number of reserved sectors\n"); + fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); brelse(bh); goto out_invalid; } if (!b->fats) { if (!silent) - printk(KERN_ERR "FAT: bogus number of FAT structure\n"); + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); brelse(bh); goto out_invalid; } @@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, media = b->media; if (!fat_valid_media(media)) { if (!silent) - printk(KERN_ERR "FAT: invalid media value (0x%02x)\n", + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", media); brelse(bh); goto out_invalid; @@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, || (logical_sector_size < 512) || (logical_sector_size > 4096)) { if (!silent) - printk(KERN_ERR "FAT: bogus logical sector size %u\n", + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", logical_sector_size); brelse(bh); goto out_invalid; @@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->sec_per_clus = b->sec_per_clus; if (!is_power_of_2(sbi->sec_per_clus)) { if (!silent) - printk(KERN_ERR "FAT: bogus sectors per cluster %u\n", + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", sbi->sec_per_clus); brelse(bh); goto out_invalid; } if (logical_sector_size < sb->s_blocksize) { - printk(KERN_ERR "FAT: logical sector size too small for device" - " (logical sector size = %u)\n", logical_sector_size); + fat_msg(sb, KERN_ERR, "logical sector size too small for device" + " (logical sector size = %u)", logical_sector_size); brelse(bh); goto out_fail; } @@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, brelse(bh); if (!sb_set_blocksize(sb, logical_sector_size)) { - printk(KERN_ERR "FAT: unable to set blocksize %u\n", + fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } bh = sb_bread(sb, 0); if (bh == NULL) { - printk(KERN_ERR "FAT: unable to read boot sector" - " (logical sector size = %lu)\n", + fat_msg(sb, KERN_ERR, "unable to read boot sector" + " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } @@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); if (fsinfo_bh == NULL) { - printk(KERN_ERR "FAT: bread failed, FSINFO block" - " (sector = %lu)\n", sbi->fsinfo_sector); + fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" + " (sector = %lu)", sbi->fsinfo_sector); brelse(bh); goto out_fail; } fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; if (!IS_FSINFO(fsinfo)) { - printk(KERN_WARNING "FAT: Invalid FSINFO signature: " - "0x%08x, 0x%08x (sector = %lu)\n", + fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); @@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->dir_entries = get_unaligned_le16(&b->dir_entries); if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) - printk(KERN_ERR "FAT: bogus directroy-entries per block" - " (%u)\n", sbi->dir_entries); + fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" + " (%u)", sbi->dir_entries); brelse(bh); goto out_invalid; } @@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) - printk(KERN_ERR "FAT: count of clusters too big (%u)\n", + fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); brelse(bh); goto out_invalid; @@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sprintf(buf, "cp%d", sbi->options.codepage); sbi->nls_disk = load_nls(buf); if (!sbi->nls_disk) { - printk(KERN_ERR "FAT: codepage %s not found\n", buf); + fat_msg(sb, KERN_ERR, "codepage %s not found", buf); goto out_fail; } @@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, if (sbi->options.isvfat) { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { - printk(KERN_ERR "FAT: IO charset %s not found\n", + fat_msg(sb, KERN_ERR, "IO charset %s not found", sbi->options.iocharset); goto out_fail; } @@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, insert_inode_hash(root_inode); sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) { - printk(KERN_ERR "FAT: get root inode failed\n"); + fat_msg(sb, KERN_ERR, "get root inode failed"); goto out_fail; } @@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, out_invalid: error = -EINVAL; if (!silent) - printk(KERN_INFO "VFS: Can't find a valid FAT filesystem" - " on dev %s.\n", sb->s_id); + fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: if (fat_inode) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 970e682ea75..6d93360ca0c 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -20,30 +20,46 @@ * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ -void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) { - struct fat_mount_options *opts = &MSDOS_SB(s)->options; + struct fat_mount_options *opts = &MSDOS_SB(sb)->options; va_list args; + struct va_format vaf; if (report) { - printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); - - printk(KERN_ERR " "); va_start(args, fmt); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); va_end(args); - printk("\n"); } if (opts->errors == FAT_ERRORS_PANIC) - panic("FAT: fs panic from previous error\n"); - else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { - s->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); + panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); + else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + sb->s_flags |= MS_RDONLY; + printk(KERN_ERR "FAT-fs (%s): Filesystem has been " + "set read-only\n", sb->s_id); } } EXPORT_SYMBOL_GPL(__fat_fs_error); +/** + * fat_msg() - print preformated FAT specific messages. Every thing what is + * not fat_fs_error() should be fat_msg(). + */ +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ int fat_clusters_flush(struct super_block *sb) @@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb) bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { - printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); + fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; /* Sanity check */ if (!IS_FSINFO(fsinfo)) { - printk(KERN_ERR "FAT: Invalid FSINFO signature: " - "0x%08x, 0x%08x (sector = %lu)\n", + fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 711499040eb..be15437c272 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; + dentry_unhash(dentry); + lock_super(sb); /* * Check whether the directory is not in use, then check @@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { err = -EIO; @@ -659,14 +664,14 @@ static const struct inode_operations msdos_dir_inode_operations = { static void setup(struct super_block *sb) { + MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; sb->s_flags |= MS_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) { - return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, - 0, setup); + return fat_fill_super(sb, data, silent, 0, setup); } static struct dentry *msdos_mount(struct file_system_type *fs_type, diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index adae3fb7451..c61a6789f36 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) struct fat_slot_info sinfo; int err; + dentry_unhash(dentry); + lock_super(sb); err = fat_dir_empty(inode); @@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; @@ -1065,6 +1070,7 @@ static const struct inode_operations vfat_dir_inode_operations = { static void setup(struct super_block *sb) { + MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; if (MSDOS_SB(sb)->options.name_check != 's') sb->s_d_op = &vfat_ci_dentry_ops; else @@ -1073,8 +1079,7 @@ static void setup(struct super_block *sb) static int vfat_fill_super(struct super_block *sb, void *data, int silent) { - return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, - 1, setup); + return fat_fill_super(sb, data, silent, 1, setup); } static struct dentry *vfat_mount(struct file_system_type *fs_type, diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 2ba6719ac61..1a4311437a8 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -272,7 +272,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) * *ip: VFS inode * * Description: - * vxfs_put_fake_inode frees all data asssociated with @ip. + * vxfs_put_fake_inode frees all data associated with @ip. */ void vxfs_put_fake_inode(struct inode *ip) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 48a18f184d5..30afdfa7aec 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op) _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - fscache_set_op_state(op, "EnQ"); - ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); @@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation); static void fscache_run_op(struct fscache_object *object, struct fscache_operation *op) { - fscache_set_op_state(op, "Run"); - object->n_in_progress++; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); - fscache_set_op_state(op, "SubmitX"); - spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object, ASSERTCMP(atomic_read(&op->usage), >, 0); - fscache_set_op_state(op, "Submit"); - spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; - fscache_set_op_state(op, "Put"); - _debug("PUT OP"); if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) BUG(); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 41c441c2058..a2a5d19ece6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); if (fscache_object_is_active(object)) { - fscache_set_op_state(op, "CallFS"); fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); - fscache_set_op_state(op, "Done"); if (ret < 0) fscache_abort_object(object); } @@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) fscache_operation_init(op, fscache_attr_changed_op, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); - fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); - fscache_set_op_name(&op->op, "Retr"); return op; } @@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, _leave(" = -ENOMEM"); return -ENOMEM; } - fscache_set_op_name(&op->op, "RetrRA1"); spin_lock(&cookie->lock); @@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(mapping, end_io_func, context); if (!op) return -ENOMEM; - fscache_set_op_name(&op->op, "RetrRAN"); spin_lock(&cookie->lock); @@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(page->mapping, NULL, NULL); if (!op) return -ENOMEM; - fscache_set_op_name(&op->op, "RetrAL1"); spin_lock(&cookie->lock); @@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); - fscache_set_op_state(&op->op, "GetPage"); - spin_lock(&object->lock); cookie = object->cookie; @@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op) spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - fscache_set_op_state(&op->op, "Store"); fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); - fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(object, page); if (ret < 0) { - fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); } else { fscache_enqueue_operation(&op->op); @@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_write_op, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); - fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b32eb29a4e6..0d0e3faddcf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (IS_ERR(req)) return PTR_ERR(req); + dentry_unhash(entry); + req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); req->in.numargs = 1; @@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); struct fuse_req *req = fuse_get_req(fc); + + if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode)) + dentry_unhash(newent); + if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 74add2ddcc3..e65493a8ac0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -780,6 +780,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, metadata = (height != ip->i_height - 1); if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; if (ip != GFS2_I(sdp->sd_rindex)) error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a2a6abbccc0..2792a790e50 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1346,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) { struct gfs2_glock *gl; int may_demote; int nr_skipped = 0; + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; LIST_HEAD(skipped); if (nr == 0) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index cec26c00b50..903115f2bb3 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -228,6 +228,27 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) return ret; } +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { + list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} /** * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced @@ -878,9 +899,9 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL); for (;;) { gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); if (gfs2_ail1_empty(sdp)) break; - msleep(10); } } @@ -920,12 +941,14 @@ int gfs2_logd(void *data) if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); - io_schedule(); + gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL); } - wake_up(&sdp->sd_log_waitq); + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; if (freezing(current)) refrigerator(); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index cfa327d3319..c2b34cd2abe 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -146,7 +146,7 @@ static int __init init_gfs2_fs(void) gfs2_register_debugfs(); - printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); + printk("GFS2 installed\n"); return 0; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e23d9864c41..42e8d23bc04 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -38,6 +38,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/mm.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> @@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; + int nr_to_scan = sc->nr_to_scan; - if (nr == 0) + if (nr_to_scan == 0) goto out; - if (!(gfp_mask & __GFP_FS)) + if (!(sc->gfp_mask & __GFP_FS)) return -1; spin_lock(&qd_lru_lock); - while (nr && !list_empty(&qd_lru_list)) { + while (nr_to_scan && !list_empty(&qd_lru_list)) { qd = list_entry(qd_lru_list.next, struct gfs2_quota_data, qd_reclaim); sdp = qd->qd_gl->gl_sbd; @@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) spin_unlock(&qd_lru_lock); kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); - nr--; + nr_to_scan--; } spin_unlock(&qd_lru_lock); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e7d236ca48b..90bf1c302a9 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -12,6 +12,7 @@ struct gfs2_inode; struct gfs2_sbd; +struct shrink_control; #define NO_QUOTA_CHANGE ((u32)-1) @@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, + struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7273ad3c85b..9b780df3fd5 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1629,6 +1629,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); + + /* Directories keep their data in the metadata address space */ + if (ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); } /** diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b4d70b13be9..1cb70cdba2c 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; + if (S_ISDIR(inode->i_mode)) + dentry_unhash(dentry); + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); @@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { + if (S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + res = hfs_remove(new_dir, new_dentry); if (res) return res; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4df5059c25d..b28835091dd 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int res; + dentry_unhash(dentry); + if (inode->i_size != 2) return -ENOTEMPTY; @@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (S_ISDIR(new_dentry->d_inode->i_mode)) { + dentry_unhash(new_dentry); res = hfsplus_rmdir(new_dir, new_dentry); - else + } else { res = hfsplus_unlink(new_dir, new_dentry); + } if (res) return res; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2638c834ed2..e6816b9e690 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) char *file; int err; + dentry_unhash(dentry); + if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; err = do_rmdir(file); @@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, char *from_name, *to_name; int err; + if (to->d_inode && S_ISDIR(to->d_inode->i_mode)) + dentry_unhash(to); + if ((from_name = dentry_name(from)) == NULL) return -ENOMEM; if ((to_name = dentry_name(to)) == NULL) { diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 1f05839c27a..ff0ce21c086 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -395,7 +395,6 @@ again: dentry_unhash(dentry); if (!d_unhashed(dentry)) { - dput(dentry); hpfs_unlock(dir->i_sb); return -ENOSPC; } @@ -403,7 +402,6 @@ again: !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry); - dput(dentry); } else { struct iattr newattrs; /*printk("HPFS: truncating file before delete.\n");*/ @@ -411,7 +409,6 @@ again: newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; err = notify_change(dentry, &newattrs); put_write_access(inode); - dput(dentry); if (!err) goto again; } @@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) int err; int r; + dentry_unhash(dentry); + hpfs_adjust_length(name, &len); hpfs_lock(dir->i_sb); err = -ENOENT; @@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh; struct fnode *fnode; int err; + + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; hpfs_adjust_length(old_name, &old_len); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b9eeb1cd03f..7aafeb8fa30 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); return 0; } @@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags) { int error = -ENOMEM; diff --git a/fs/inode.c b/fs/inode.c index 05f4fa52132..990d284877a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -326,12 +326,11 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); + mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); - mutex_init(&mapping->unmap_mutex); } EXPORT_SYMBOL(address_space_init_once); @@ -752,8 +751,12 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + if (nr) { /* * Nasty deadlock avoidance. We may hold various FS locks, diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 69b18045946..72ffa974b0b 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); @@ -722,8 +716,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b3713afaaa9..e2d4285fbe9 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal) int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d2319651b..f7ee81a065d 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b2..7f21cf3aaf9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - commit_transaction->t_flushed_data_blocks = 1; clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); smp_mb__after_clear_bit(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); @@ -338,12 +337,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); @@ -678,12 +671,16 @@ start_journal_io: err = 0; } + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_DFLUSH; + write_unlock(&journal->j_state_lock); /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue * the commit record */ - if (commit_transaction->t_flushed_data_blocks && + if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); @@ -760,8 +757,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this IO to + * complete. The barrier must be here so that changes by + * jbd2_journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -800,6 +802,10 @@ wait_for_iobuf: jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); + write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); + commit_transaction->t_state = T_COMMIT_JFLUSH; + write_unlock(&journal->j_state_lock); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -955,7 +961,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 7\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); commit_transaction->t_start = jiffies; stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e0ec3db1c39..9a782699030 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal) int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, + journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } @@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) } /* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JBD2_BARRIER)) + return 0; + read_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + commit_trans = journal->j_committing_transaction; + if (!commit_trans || commit_trans->t_tid != tid) { + ret = 1; + goto out; + } + /* + * Transaction is being committed and we already proceeded to + * submitting a flush to fs partition? + */ + if (journal->j_fs_dev != journal->j_dev) { + if (!commit_trans->t_need_data_flush || + commit_trans->t_state >= T_COMMIT_DFLUSH) + goto out; + } else { + if (commit_trans->t_state >= T_COMMIT_JFLUSH) + goto out; + } + ret = 1; +out: + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); + +/* * Wait for a specified commit to complete. * The caller may not hold the journal lock. */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 05fa77a2371..3eec82d32fd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) */ /* - * Update transiaction's maximum wait time, if debugging is enabled. + * Update transaction's maximum wait time, if debugging is enabled. * * In order for t_max_wait to be reliable, it must be protected by a * lock. But doing so will mean that start_this_handle() can not be @@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * means that maximum wait time reported by the jbd2_run_stats * tracepoint will always be zero. */ -static inline void update_t_max_wait(transaction_t *transaction) +static inline void update_t_max_wait(transaction_t *transaction, + unsigned long ts) { #ifdef CONFIG_JBD2_DEBUG - unsigned long ts = jiffies; - if (jbd2_journal_enable_debug && time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); @@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, tid_t tid; int needed, need_to_start; int nblocks = handle->h_buffer_credits; + unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -271,7 +271,7 @@ repeat: /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ - update_t_max_wait(transaction); + update_t_max_wait(transaction, ts); handle->h_transaction = transaction; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { @@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); jbd2_journal_cancel_revoke(handle, jh); - jbd2_journal_put_journal_head(jh); out: + jbd2_journal_put_journal_head(jh); return err; } @@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) jinode->i_next_transaction == transaction) goto done; + /* + * We only ever set this variable to 1 so the test is safe. Since + * t_need_data_flush is likely to be set, we do the test to save some + * cacheline bouncing + */ + if (!transaction->t_need_data_flush) + transaction->t_need_data_flush = 1; /* On some different transaction's list - should be * the committing one */ if (jinode->i_transaction) { diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 82faddd1f32..05f73328b28 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) int ret; uint32_t now = get_seconds(); + dentry_unhash(dentry); + for (fd = f->dents ; fd; fd = fd->next) { if (fd->ino) return -ENOTEMPTY; @@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, uint8_t type; uint32_t now; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + /* The VFS will check for us and prevent trying to rename a * file over a directory and vice versa, but if it's a directory, * the VFS can't check whether the victim is empty. The filesystem diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index eaaf2b511e8..865df16a6cf 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); + dentry_unhash(dentry); + /* Init inode for quota operations. */ dquot_initialize(dip); dquot_initialize(ip); @@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + dquot_initialize(old_dir); dquot_initialize(new_dir); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1663f..f34c9cde9e9 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + dentry_unhash(dentry); + if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, loff_t pos; int err; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + /* 1. locate source dd */ err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); if (err) diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 9e22085231b..d8d09380c7d 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb, val = inode_val0(inode); break; case INODE_USED_OFS: - val = cpu_to_be64(li->li_used_bytes);; + val = cpu_to_be64(li->li_used_bytes); break; case INODE_SIZE_OFS: val = cpu_to_be64(i_size_read(inode)); diff --git a/fs/mbcache.c b/fs/mbcache.c index 2f174be0655..8c32ef3ba88 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock); * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, + struct shrink_control *sc); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -156,18 +157,19 @@ forget: * gets low. * * @shrink: (ignored) - * @nr_to_scan: Number of objects to scan - * @gfp_mask: (ignored) + * @sc: shrink_control passed from reclaim * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(free_list); struct mb_cache *cache; struct mb_cache_entry *entry, *tmp; int count = 0; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 6e6777f1b4b..f60aed8db9c 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err = -ENOTEMPTY; + dentry_unhash(dentry); + if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { @@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct minix_dir_entry * old_de; int err = -ENOENT; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/fs/mpage.c b/fs/mpage.c index 0afc809e46e..fdfae9fa98c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/cleancache.h> /* * I/O completion handler for multipage BIOs. @@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, SetPageMappedToDisk(page); } + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/fs/namei.c b/fs/namei.c index 6ff858c049c..2358b326b22 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -391,79 +391,28 @@ void path_put(struct path *path) } EXPORT_SYMBOL(path_put); -/** - * nameidata_drop_rcu - drop this nameidata out of rcu-walk - * @nd: nameidata pathwalk data to drop - * Returns: 0 on success, -ECHILD on failure - * +/* * Path walking has 2 modes, rcu-walk and ref-walk (see - * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt - * to drop out of rcu-walk mode and take normal reference counts on dentries - * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take - * refcounts at the last known good point before rcu-walk got stuck, so - * ref-walk may continue from there. If this is not successful (eg. a seqcount - * has changed), then failure is returned and path walk restarts from the - * beginning in ref-walk mode. - * - * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into - * ref-walk. Must be called from rcu-walk context. + * Documentation/filesystems/path-lookup.txt). In situations when we can't + * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab + * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * mode. Refcounts are grabbed at the last known good point before rcu-walk + * got stuck, so ref-walk may continue from there. If this is not successful + * (eg. a seqcount has changed), then failure is returned and it's up to caller + * to restart the path walk from the beginning in ref-walk mode. */ -static int nameidata_drop_rcu(struct nameidata *nd) -{ - struct fs_struct *fs = current->fs; - struct dentry *dentry = nd->path.dentry; - int want_root = 0; - - BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; - } - spin_lock(&dentry->d_lock); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err; - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - if (want_root) { - path_get(&nd->root); - spin_unlock(&fs->lock); - } - mntget(nd->path.mnt); - - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - nd->flags &= ~LOOKUP_RCU; - return 0; -err: - spin_unlock(&dentry->d_lock); -err_root: - if (want_root) - spin_unlock(&fs->lock); - return -ECHILD; -} - -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_maybe(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) - return nameidata_drop_rcu(nd); - return 0; -} /** - * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk - * @nd: nameidata pathwalk data to drop - * @dentry: dentry to drop + * unlazy_walk - try to switch to ref-walk mode. + * @nd: nameidata pathwalk data + * @dentry: child of nd->path.dentry or NULL * Returns: 0 on success, -ECHILD on failure * - * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, - * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on - * @nd. Must be called from rcu-walk context. + * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry + * for ref-walk mode. @dentry must be a path found by a do_lookup call on + * @nd or NULL. Must be called from rcu-walk context. */ -static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; @@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry goto err_root; } spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err; - /* - * If the sequence check on the child dentry passed, then the child has - * not been removed from its parent. This means the parent dentry must - * be valid and able to take a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); + if (!dentry) { + if (!__d_rcu_to_refcount(parent, nd->seq)) + goto err_parent; + BUG_ON(nd->inode != parent->d_inode); + } else { + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (!__d_rcu_to_refcount(dentry, nd->seq)) + goto err_child; + /* + * If the sequence check on the child dentry passed, then + * the child has not been removed from its parent. This + * means the parent dentry must be valid and able to take + * a reference at this point. + */ + BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); + BUG_ON(!parent->d_count); + parent->d_count++; + spin_unlock(&dentry->d_lock); + } spin_unlock(&parent->d_lock); if (want_root) { path_get(&nd->root); @@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry br_read_unlock(vfsmount_lock); nd->flags &= ~LOOKUP_RCU; return 0; -err: + +err_child: spin_unlock(&dentry->d_lock); +err_parent: spin_unlock(&parent->d_lock); err_root: if (want_root) @@ -510,59 +468,6 @@ err_root: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) -{ - if (nd->flags & LOOKUP_RCU) { - if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - return -ECHILD; - } - } - return 0; -} - -/** - * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk - * @nd: nameidata pathwalk data to drop - * Returns: 0 on success, -ECHILD on failure - * - * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk. - * nd->path should be the final element of the lookup, so nd->root is discarded. - * Must be called from rcu-walk context. - */ -static int nameidata_drop_rcu_last(struct nameidata *nd) -{ - struct dentry *dentry = nd->path.dentry; - - BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_unlock; - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - - mntget(nd->path.mnt); - - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - - return 0; - -err_unlock: - spin_unlock(&dentry->d_lock); - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - return -ECHILD; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -/* - * handle_reval_path - force revalidation of a dentry - * - * In some situations the path walking code will trust dentries without - * revalidating them. This causes problems for filesystems that depend on - * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set - * (which indicates that it's possible for the dentry to go stale), force - * a d_revalidate call before proceeding. +/** + * complete_walk - successful completion of path walk + * @nd: pointer nameidata * - * Returns 0 if the revalidation was successful. If the revalidation fails, - * either return the error returned by d_revalidate or -ESTALE if the - * revalidation it just returned 0. If d_revalidate returns 0, we attempt to - * invalidate the dentry. It's up to the caller to handle putting references - * to the path if necessary. + * If we had been in RCU mode, drop out of it and legitimize nd->path. + * Revalidate the final result, unless we'd already done that during + * the path walk or the filesystem doesn't ask for it. Return 0 on + * success, -error on failure. In case of failure caller does not + * need to drop nd->path. */ -static inline int handle_reval_path(struct nameidata *nd) +static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; + if (nd->flags & LOOKUP_RCU) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + spin_lock(&dentry->d_lock); + if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + BUG_ON(nd->inode != dentry->d_inode); + spin_unlock(&dentry->d_lock); + mntget(nd->path.mnt); + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + } + if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; @@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd) if (!status) status = -ESTALE; + path_put(&nd->path); return status; } @@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; unlazy: - if (dentry) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return -ECHILD; - } else { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - } + if (unlazy_walk(nd, dentry)) + return -ECHILD; } else { dentry = __d_lookup(parent, name); } @@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd) int err = exec_permission(nd->inode, IPERM_FLAG_RCU); if (err != -ECHILD) return err; - if (nameidata_drop_rcu(nd)) + if (unlazy_walk(nd, NULL)) return -ECHILD; } return exec_permission(nd->inode, 0); @@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path, return -ENOENT; } if (unlikely(inode->i_op->follow_link) && follow) { - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return -ECHILD; + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + terminate_walk(nd); + return -ECHILD; + } + } BUG_ON(inode != path->dentry->d_inode); return 1; } @@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name, } } - if (nd->flags & LOOKUP_RCU) { - /* went all way through without dropping RCU */ - BUG_ON(err); - if (nameidata_drop_rcu_last(nd)) - err = -ECHILD; - } - - if (!err) { - err = handle_reval_path(nd); - if (err) - path_put(&nd->path); - } + if (!err) + err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { if (!nd->inode->i_op->lookup) { @@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - if (nd->flags & LOOKUP_RCU) { - if (nameidata_drop_rcu_last(nd)) - return ERR_PTR(-ECHILD); - } - error = handle_reval_path(nd); + error = complete_walk(nd); if (error) - goto exit; + return ERR_PTR(error); audit_inode(pathname, nd->path.dentry); if (open_flag & O_CREAT) { error = -EISDIR; @@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } goto ok; case LAST_BIND: - /* can't be RCU mode here */ - error = handle_reval_path(nd); + error = complete_walk(nd); if (error) - goto exit; + return ERR_PTR(error); audit_inode(pathname, dir); goto ok; } @@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) /* symlink */ return NULL; /* sayonara */ - if (nd->flags & LOOKUP_RCU) { - if (nameidata_drop_rcu_last(nd)) - return ERR_PTR(-ECHILD); - } + error = complete_walk(nd); + if (error) + return ERR_PTR(-ECHILD); error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { @@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ - - if (nd->flags & LOOKUP_RCU) { - if (nameidata_drop_rcu_last(nd)) - return ERR_PTR(-ECHILD); - } + error = complete_walk(nd); + if (error) + return ERR_PTR(error); audit_inode(pathname, dir); error = -EISDIR; @@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) } /* - * We try to drop the dentry early: we should have - * a usage count of 2 if we're the only user of this - * dentry, and if that is true (possibly after pruning - * the dcache), then we drop the dentry now. + * The dentry_unhash() helper will try to drop the dentry early: we + * should have a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning the dcache), + * then we drop the dentry now. * * A low-level filesystem can, if it choses, legally * do a @@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) */ void dentry_unhash(struct dentry *dentry) { - dget(dentry); shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 2) + if (dentry->d_count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } @@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; mutex_lock(&dentry->d_inode->i_mutex); - dentry_unhash(dentry); + + error = -EBUSY; if (d_mountpoint(dentry)) - error = -EBUSY; - else { - error = security_inode_rmdir(dir, dentry); - if (!error) { - error = dir->i_op->rmdir(dir, dentry); - if (!error) { - dentry->d_inode->i_flags |= S_DEAD; - dont_mount(dentry); - } - } - } + goto out; + + error = security_inode_rmdir(dir, dentry); + if (error) + goto out; + + error = dir->i_op->rmdir(dir, dentry); + if (error) + goto out; + + dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + +out: mutex_unlock(&dentry->d_inode->i_mutex); - if (!error) { + if (!error) d_delete(dentry); - } - dput(dentry); - return error; } @@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. - * d) some filesystems don't support opened-but-unlinked directories, - * either because of layout or because they are not ready to deal with - * all cases correctly. The latter will be fixed (taking this sort of - * stuff into VFS), but the former is not going away. Solution: the same - * trick as in rmdir(). - * e) conversion from fhandle to dentry may come in the wrong moment - when + * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive @@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { int error = 0; - struct inode *target; + struct inode *target = new_dentry->d_inode; /* * If we are going to change the parent - check write permissions, @@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - error = -EBUSY; - else { - if (target) - dentry_unhash(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - } + + error = -EBUSY; + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + goto out; + + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + if (target) { - if (!error) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); - } - mutex_unlock(&target->i_mutex); - if (d_unhashed(new_dentry)) - d_rehash(new_dentry); - dput(new_dentry); + target->i_flags |= S_DEAD; + dont_mount(new_dentry); } +out: + if (target) + mutex_unlock(&target->i_mutex); if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); @@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *target; + struct inode *target = new_dentry->d_inode; int error; error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, return error; dget(new_dentry); - target = new_dentry->d_inode; if (target) mutex_lock(&target->i_mutex); + + error = -EBUSY; if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (!error) { - if (target) - dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); - } + goto out; + + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + goto out; + + if (target) + dont_mount(new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry, new_dentry); +out: if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); diff --git a/fs/namespace.c b/fs/namespace.c index d99bcf59e4c..fe59bd145d2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) static int flags_to_propagation_type(int flags) { - int type = flags & ~MS_REC; + int type = flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f6946bb5cb5..e3e646b0640 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); + dentry_unhash(dentry); + error = -EBUSY; if (!d_unhashed(dentry)) goto out; @@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0250e4ce489..202f370526a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) #endif struct ncp_entry_info finfo; - data.wdog_pid = NULL; + memset(&data, 0, sizeof(data)); server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); if (!server) return -ENOMEM; @@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; data.flags = md->flags; - data.int_flags = 0; data.mounted_uid = md->mounted_uid; data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; @@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; - data.mounted_vol[0] = 0; } break; default: diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index a7c07b44b10..e5d71b27a5b 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -16,6 +16,7 @@ #include <linux/mman.h> #include <linux/string.h> #include <linux/fcntl.h> +#include <linux/memcontrol.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, * -- wli */ count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7237672216c..424e47773a8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return (nr_to_scan == 0) ? 0 : -1; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ce118ce885d..2df6ca7b589 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp, /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, - int nr_to_scan, gfp_t gfp_mask); + struct shrink_control *sc); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 5232d3e8fb2..a2e2402b2af 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -8,7 +8,7 @@ * Statistsics for the reply cache * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache> * statistics for filehandle lookup - * io <bytes-read> <bytes-writtten> + * io <bytes-read> <bytes-written> * statistics for IO throughput * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> * time (seconds) when nfsd thread usage above thresholds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index f7684483785..eed4d7b2624 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -489,8 +489,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { - nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); - nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); @@ -527,8 +527,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); - nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); - nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); @@ -683,8 +683,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) kunmap(bitmap_bh->b_page); kunmap(desc_bh->b_page); - nilfs_mdt_mark_buffer_dirty(desc_bh); - nilfs_mdt_mark_buffer_dirty(bitmap_bh); + mark_buffer_dirty(desc_bh); + mark_buffer_dirty(bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(bitmap_bh); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 4723f04e9b1..aadbd0b5e3e 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -34,7 +34,9 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) { - return NILFS_I_NILFS(bmap->b_inode)->ns_dat; + struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; + + return nilfs->ns_dat; } static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 609cd223eea..a35ae35e693 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -34,12 +34,6 @@ #include "page.h" #include "btnode.h" -void nilfs_btnode_cache_init(struct address_space *btnc, - struct backing_dev_info *bdi) -{ - nilfs_mapping_init(btnc, bdi); -} - void nilfs_btnode_cache_clear(struct address_space *btnc) { invalidate_mapping_pages(btnc, 0, -1); @@ -62,7 +56,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) BUG(); } memset(bh->b_data, 0, 1 << inode->i_blkbits); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -94,10 +88,11 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { - struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; /* blocknr is a virtual block number */ - err = nilfs_dat_translate(dat, blocknr, &pblocknr); + err = nilfs_dat_translate(nilfs->ns_dat, blocknr, + &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; @@ -120,7 +115,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -259,7 +254,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); - nilfs_btnode_mark_dirty(obh); + mark_buffer_dirty(obh); spin_lock_irq(&btnc->tree_lock); radix_tree_delete(&btnc->page_tree, oldkey); @@ -271,7 +266,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, unlock_page(opage); } else { nilfs_copy_buffer(nbh, obh); - nilfs_btnode_mark_dirty(nbh); + mark_buffer_dirty(nbh); nbh->b_blocknr = newkey; ctxt->bh = nbh; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 1b8ebd888c2..3a4dd2d8d3f 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); @@ -51,7 +50,4 @@ void nilfs_btnode_commit_change_key(struct address_space *, void nilfs_btnode_abort_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); -#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) - - #endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index d451ae0e0bf..7eafe468a29 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -714,7 +714,7 @@ static void nilfs_btree_promote_key(struct nilfs_bmap *btree, nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } @@ -739,7 +739,7 @@ static void nilfs_btree_do_insert(struct nilfs_bmap *btree, nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, @@ -777,9 +777,9 @@ static void nilfs_btree_carry_left(struct nilfs_bmap *btree, nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -823,9 +823,9 @@ static void nilfs_btree_carry_right(struct nilfs_bmap *btree, nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, @@ -870,9 +870,9 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; @@ -919,7 +919,7 @@ static void nilfs_btree_grow(struct nilfs_bmap *btree, nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1194,7 +1194,7 @@ static void nilfs_btree_do_delete(struct nilfs_bmap *btree, nilfs_btree_node_delete(node, path[level].bp_index, keyp, ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1226,9 +1226,9 @@ static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1258,9 +1258,9 @@ static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, @@ -1289,7 +1289,7 @@ static void nilfs_btree_concat_left(struct nilfs_bmap *btree, nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; @@ -1315,7 +1315,7 @@ static void nilfs_btree_concat_right(struct nilfs_bmap *btree, nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1709,7 +1709,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); if (!buffer_dirty(bh)) - nilfs_btnode_mark_dirty(bh); + mark_buffer_dirty(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); @@ -1787,7 +1787,7 @@ static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, { while ((++level < nilfs_btree_height(btree) - 1) && !buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); return 0; } @@ -2229,7 +2229,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) } if (!buffer_dirty(bh)) - nilfs_btnode_mark_dirty(bh); + mark_buffer_dirty(bh); brelse(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 5ff15a8a102..c9b342c8b50 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -216,14 +216,14 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, kaddr, 1); - nilfs_mdt_mark_buffer_dirty(cp_bh); + mark_buffer_dirty(cp_bh); kaddr = kmap_atomic(header_bh->b_page, KM_USER0); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, 1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); } @@ -326,7 +326,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } if (nicps > 0) { tnicps += nicps; - nilfs_mdt_mark_buffer_dirty(cp_bh); + mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); if (!nilfs_cpfile_is_in_first(cpfile, cno)) { count = @@ -358,7 +358,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); kunmap_atomic(kaddr, KM_USER0); } @@ -671,10 +671,10 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) le64_add_cpu(&header->ch_nsnapshots, 1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(prev_bh); - nilfs_mdt_mark_buffer_dirty(curr_bh); - nilfs_mdt_mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(curr_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); @@ -774,10 +774,10 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) le64_add_cpu(&header->ch_nsnapshots, -1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(next_bh); - nilfs_mdt_mark_buffer_dirty(prev_bh); - nilfs_mdt_mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(next_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 59e5fe742f7..fcc2f869af1 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -54,7 +54,7 @@ static int nilfs_dat_prepare_entry(struct inode *dat, static void nilfs_dat_commit_entry(struct inode *dat, struct nilfs_palloc_req *req) { - nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + mark_buffer_dirty(req->pr_entry_bh); nilfs_mdt_mark_dirty(dat); brelse(req->pr_entry_bh); } @@ -361,7 +361,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) entry->de_blocknr = cpu_to_le64(blocknr); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(entry_bh); + mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); brelse(entry_bh); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 397e7325863..d7eeca62feb 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -111,7 +111,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - SetPageChecked(page); wait_on_page_writeback(page); return VM_FAULT_LOCKED; } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1c2a3e23f8b..08a07a218d2 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -48,9 +48,6 @@ #include "dat.h" #include "ifile.h" -static const struct address_space_operations def_gcinode_aops = { -}; - /* * nilfs_gccache_submit_read_data() - add data buffer and submit read request * @inode - gc inode @@ -87,9 +84,9 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; if (pbn == 0) { - struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; - /* use original dat, not gc dat. */ - err = nilfs_dat_translate(dat_inode, vbn, &pbn); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ brelse(bh); goto failed; @@ -103,7 +100,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, } if (!buffer_mapped(bh)) { - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; set_buffer_mapped(bh); } bh->b_blocknr = pbn; @@ -160,15 +157,11 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) if (buffer_dirty(bh)) return -EEXIST; - if (buffer_nilfs_node(bh)) { - if (nilfs_btree_broken_node_block(bh)) { - clear_buffer_uptodate(bh); - return -EIO; - } - nilfs_btnode_mark_dirty(bh); - } else { - nilfs_mark_buffer_dirty(bh); + if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; } + mark_buffer_dirty(bh); return 0; } @@ -178,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &def_gcinode_aops; + inode->i_mapping->a_ops = &empty_aops; inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index bfc73d3a30e..684d76300a8 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -80,7 +80,7 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, return ret; } nilfs_palloc_commit_alloc_entry(ifile, &req); - nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + mark_buffer_dirty(req.pr_entry_bh); nilfs_mdt_mark_dirty(ifile); *out_ino = (ino_t)req.pr_entry_nr; *out_bh = req.pr_entry_bh; @@ -128,7 +128,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) raw_inode->i_flags = 0; kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); nilfs_palloc_commit_free_entry(ifile, &req); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c0aa27490c0..587f1843283 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -74,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) { struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; - struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; - down_read(&NILFS_MDT(dat)->mi_sem); + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); - up_read(&NILFS_MDT(dat)->mi_sem); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (ret >= 0) { /* found */ map_bh(bh_result, inode->i_sb, blknum); if (ret > 0) @@ -596,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* zero-fill unused portion in the case of super root block */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + } + if (has_bmap) nilfs_bmap_write(ii->i_bmap, raw_inode); else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -872,8 +882,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) return -EINVAL; /* NILFS_I_DIRTY may remain for freeing inode */ } - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files); + list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); } spin_unlock(&nilfs->ns_inode_lock); @@ -892,7 +901,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) return err; } nilfs_update_inode(inode, ibh); - nilfs_mdt_mark_buffer_dirty(ibh); + mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); brelse(ibh); return 0; @@ -931,7 +940,7 @@ void nilfs_dirty_inode(struct inode *inode) int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - struct the_nilfs *nilfs = NILFS_I_NILFS(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 logical = 0, phys = 0, size = 0; __u32 flags = 0; loff_t isize; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f2469ba6246..41d6743d303 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -698,6 +698,63 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, + void __user *argp) +{ + __u64 newsize; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_from_user(&newsize, argp, sizeof(newsize))) + goto out_drop_write; + + ret = nilfs_resize_fs(inode->i_sb, newsize); + +out_drop_write: + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 range[2]; + __u64 minseg, maxseg; + unsigned long segbytes; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EFAULT; + if (copy_from_user(range, argp, sizeof(__u64[2]))) + goto out; + + ret = -ERANGE; + if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + goto out; + + segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; + + minseg = range[0] + segbytes - 1; + do_div(minseg, segbytes); + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + do_div(maxseg, segbytes); + maxseg--; + + ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); +out: + return ret; +} + static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, @@ -763,6 +820,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); case NILFS_IOCTL_SYNC: return nilfs_ioctl_sync(inode, filp, cmd, argp); + case NILFS_IOCTL_RESIZE: + return nilfs_ioctl_resize(inode, filp, argp); + case NILFS_IOCTL_SET_ALLOC_RANGE: + return nilfs_ioctl_set_alloc_range(inode, argp); default: return -ENOTTY; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index a649b05f706..800e8d78a83 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -66,7 +66,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, kunmap_atomic(kaddr, KM_USER0); set_buffer_uptodate(bh); - nilfs_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); return 0; } @@ -355,7 +355,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) err = nilfs_mdt_read_block(inode, block, 0, &bh); if (unlikely(err)) return err; - nilfs_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); brelse(bh); return 0; @@ -450,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode, bdi); address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ed68563ec70..ab20a4baa50 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -64,11 +64,6 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) return inode->i_private; } -static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) -{ - return inode->i_sb->s_fs_info; -} - /* Default GFP flags using highmem */ #define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) @@ -93,8 +88,6 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh); -#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) - static inline void nilfs_mdt_mark_dirty(struct inode *inode) { if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) @@ -108,7 +101,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode) static inline __u64 nilfs_mdt_cno(struct inode *inode) { - return NILFS_I_NILFS(inode)->ns_cno; + return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; } #define nilfs_mdt_bgl_lock(inode, bg) \ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 546849b3e88..1102a5fbb74 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) struct nilfs_transaction_info ti; int err; + dentry_unhash(dentry); + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); if (err) return err; @@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct nilfs_transaction_info ti; int err; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a8dd344303c..a9c6a531f80 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -80,12 +80,6 @@ static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) return &ii->vfs_inode; } -static inline struct inode *NILFS_AS_I(struct address_space *mapping) -{ - return (mapping->host) ? : - container_of(mapping, struct inode, i_data); -} - /* * Dynamic state flags of NILFS on-memory inode (i_state) */ @@ -298,6 +292,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, int flip); int nilfs_commit_super(struct super_block *sb, int flag); int nilfs_cleanup_super(struct super_block *sb); +int nilfs_resize_fs(struct super_block *sb, __u64 newsize); int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, struct nilfs_root **root); int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1168059c7ef..65221a04c6f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -37,8 +37,7 @@ #define NILFS_BUFFER_INHERENT_BITS \ ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \ - (1UL << BH_NILFS_Checked)) + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) static struct buffer_head * __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -59,19 +58,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, return bh; } -/* - * Since the page cache of B-tree node pages or data page cache of pseudo - * inodes does not have a valid mapping->host pointer, calling - * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; - * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). - * To avoid this problem, the old style mark_buffer_dirty() is used instead. - */ -void nilfs_mark_buffer_dirty(struct buffer_head *bh) -{ - if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); -} - struct buffer_head *nilfs_grab_buffer(struct inode *inode, struct address_space *mapping, unsigned long blkoff, @@ -183,7 +169,7 @@ int nilfs_page_buffers_clean(struct page *page) void nilfs_page_bug(struct page *page) { struct address_space *m; - unsigned long ino = 0; + unsigned long ino; if (unlikely(!page)) { printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); @@ -191,11 +177,8 @@ void nilfs_page_bug(struct page *page) } m = page->mapping; - if (m) { - struct inode *inode = NILFS_AS_I(m); - if (inode != NULL) - ino = inode->i_ino; - } + ino = m ? m->host->i_ino : 0; + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", page, atomic_read(&page->_count), @@ -217,56 +200,6 @@ void nilfs_page_bug(struct page *page) } /** - * nilfs_alloc_private_page - allocate a private page with buffer heads - * - * Return Value: On success, a pointer to the allocated page is returned. - * On error, NULL is returned. - */ -struct page *nilfs_alloc_private_page(struct block_device *bdev, int size, - unsigned long state) -{ - struct buffer_head *bh, *head, *tail; - struct page *page; - - page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */ - if (unlikely(!page)) - return NULL; - - lock_page(page); - head = alloc_page_buffers(page, size, 0); - if (unlikely(!head)) { - unlock_page(page); - __free_page(page); - return NULL; - } - - bh = head; - do { - bh->b_state = (1UL << BH_NILFS_Allocated) | state; - tail = bh; - bh->b_bdev = bdev; - bh = bh->b_this_page; - } while (bh); - - tail->b_this_page = head; - attach_page_buffers(page, head); - - return page; -} - -void nilfs_free_private_page(struct page *page) -{ - BUG_ON(!PageLocked(page)); - BUG_ON(page->mapping); - - if (page_has_buffers(page) && !try_to_free_buffers(page)) - NILFS_PAGE_BUG(page, "failed to free page"); - - unlock_page(page); - __free_page(page); -} - -/** * nilfs_copy_page -- copy the page with buffers * @dst: destination page * @src: source page @@ -492,10 +425,10 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi) { - mapping->host = NULL; + mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->assoc_mapping = NULL; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index f06b79ad749..fb7de71605a 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -38,14 +38,12 @@ enum { BH_NILFS_Redirected, }; -BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ BUFFER_FNS(NILFS_Volatile, nilfs_volatile) BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ -void nilfs_mark_buffer_dirty(struct buffer_head *bh); int __nilfs_clear_page_dirty(struct page *); struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, @@ -54,14 +52,11 @@ void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); int nilfs_page_buffers_clean(struct page *); void nilfs_page_bug(struct page *); -struct page *nilfs_alloc_private_page(struct block_device *, int, - unsigned long); -void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init(struct address_space *mapping, +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ba4a64518f3..a604ac0331b 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -387,9 +387,9 @@ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { - struct nilfs_recovery_block *rb - = list_entry(head->next, - struct nilfs_recovery_block, list); + struct nilfs_recovery_block *rb; + + rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } @@ -416,9 +416,9 @@ static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { - struct nilfs_segment_entry *ent - = list_entry(head->next, - struct nilfs_segment_entry, list); + struct nilfs_segment_entry *ent; + + ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2853ff20f85..850a7c0228f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -239,12 +239,15 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct nilfs_super_root *raw_sr; + struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; + unsigned srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); crc = crc32_le(seed, (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), - NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + srsize - sizeof(raw_sr->sr_sum)); raw_sr->sr_sum = cpu_to_le32(crc); } @@ -254,18 +257,6 @@ static void nilfs_release_buffers(struct list_head *list) list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); - if (buffer_nilfs_allocated(bh)) { - struct page *clone_page = bh->b_page; - - /* remove clone page */ - brelse(bh); - page_cache_release(clone_page); /* for each bh */ - if (page_count(clone_page) <= 2) { - lock_page(clone_page); - nilfs_free_private_page(clone_page); - } - continue; - } brelse(bh); } } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index afe4f218345..141646e88fb 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -655,13 +655,10 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, if (unlikely(page->index > last)) break; - if (mapping->host) { - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, - 1 << inode->i_blkbits, 0); - unlock_page(page); - } + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + unlock_page(page); bh = head = page_buffers(page); do { @@ -809,7 +806,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) /* The following code is duplicated with cpfile. But, it is needed to collect the checkpoint even if it was not newly created */ - nilfs_mdt_mark_buffer_dirty(bh_cp); + mark_buffer_dirty(bh_cp); nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); @@ -889,12 +886,14 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; - unsigned isz = nilfs->ns_inode_size; + unsigned isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + isz = nilfs->ns_inode_size; + srsz = NILFS_SR_BYTES(isz); - raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); + raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? nilfs->ns_nongc_ctime : sci->sc_seg_ctime); @@ -906,6 +905,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, NILFS_SR_CPFILE_OFFSET(isz), 1); nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); } static void nilfs_redirty_inodes(struct list_head *head) @@ -954,8 +954,8 @@ static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, dispose_buffers: while (!list_empty(listp)) { - bh = list_entry(listp->next, struct buffer_head, - b_assoc_buffers); + bh = list_first_entry(listp, struct buffer_head, + b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); } @@ -1500,10 +1500,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); - if (buffer_nilfs_node(bh)) - inode = NILFS_BTNC_I(bh->b_page->mapping); - else - inode = NILFS_AS_I(bh->b_page->mapping); + inode = bh->b_page->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; @@ -1556,83 +1553,24 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) return 0; } -static int -nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) -{ - struct page *clone_page; - struct buffer_head *bh, *head, *bh2; - void *kaddr; - - bh = head = page_buffers(page); - - clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0); - if (unlikely(!clone_page)) - return -ENOMEM; - - bh2 = page_buffers(clone_page); - kaddr = kmap_atomic(page, KM_USER0); - do { - if (list_empty(&bh->b_assoc_buffers)) - continue; - get_bh(bh2); - page_cache_get(clone_page); /* for each bh */ - memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size); - bh2->b_blocknr = bh->b_blocknr; - list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers); - list_add_tail(&bh->b_assoc_buffers, out); - } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); - kunmap_atomic(kaddr, KM_USER0); - - if (!TestSetPageWriteback(clone_page)) - account_page_writeback(clone_page); - unlock_page(clone_page); - - return 0; -} - -static int nilfs_test_page_to_be_frozen(struct page *page) -{ - struct address_space *mapping = page->mapping; - - if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode)) - return 0; - - if (page_mapped(page)) { - ClearPageChecked(page); - return 1; - } - return PageChecked(page); -} - -static int nilfs_begin_page_io(struct page *page, struct list_head *out) +static void nilfs_begin_page_io(struct page *page) { if (!page || PageWriteback(page)) /* For split b-tree node pages, this function may be called twice. We ignore the 2nd or later calls by this check. */ - return 0; + return; lock_page(page); clear_page_dirty_for_io(page); set_page_writeback(page); unlock_page(page); - - if (nilfs_test_page_to_be_frozen(page)) { - int err = nilfs_copy_replace_page_buffers(page, out); - if (unlikely(err)) - return err; - } - return 0; } -static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, - struct page **failed_page) +static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; - struct list_head *list = &sci->sc_copied_buffers; - int err; - *failed_page = NULL; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { struct buffer_head *bh; @@ -1662,11 +1600,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, break; } if (bh->b_page != fs_page) { - err = nilfs_begin_page_io(fs_page, list); - if (unlikely(err)) { - *failed_page = fs_page; - goto out; - } + nilfs_begin_page_io(fs_page); fs_page = bh->b_page; } } @@ -1677,11 +1611,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, set_page_writeback(bd_page); unlock_page(bd_page); } - err = nilfs_begin_page_io(fs_page, list); - if (unlikely(err)) - *failed_page = fs_page; - out: - return err; + nilfs_begin_page_io(fs_page); } static int nilfs_segctor_write(struct nilfs_sc_info *sci, @@ -1694,24 +1624,6 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci, return ret; } -static void __nilfs_end_page_io(struct page *page, int err) -{ - if (!err) { - if (!nilfs_page_buffers_clean(page)) - __set_page_dirty_nobuffers(page); - ClearPageError(page); - } else { - __set_page_dirty_nobuffers(page); - SetPageError(page); - } - - if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) - dec_zone_page_state(page, NR_WRITEBACK); - } else - end_page_writeback(page); -} - static void nilfs_end_page_io(struct page *page, int err) { if (!page) @@ -1738,40 +1650,19 @@ static void nilfs_end_page_io(struct page *page, int err) return; } - __nilfs_end_page_io(page, err); -} - -static void nilfs_clear_copied_buffers(struct list_head *list, int err) -{ - struct buffer_head *bh, *head; - struct page *page; - - while (!list_empty(list)) { - bh = list_entry(list->next, struct buffer_head, - b_assoc_buffers); - page = bh->b_page; - page_cache_get(page); - head = bh = page_buffers(page); - do { - if (!list_empty(&bh->b_assoc_buffers)) { - list_del_init(&bh->b_assoc_buffers); - if (!err) { - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - clear_buffer_delay(bh); - clear_buffer_nilfs_volatile(bh); - } - brelse(bh); /* for b_assoc_buffers */ - } - } while ((bh = bh->b_this_page) != head); - - __nilfs_end_page_io(page, err); - page_cache_release(page); + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); } + + end_page_writeback(page); } -static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, - int err) +static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; @@ -1801,8 +1692,6 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, } if (bh->b_page != fs_page) { nilfs_end_page_io(fs_page, err); - if (fs_page && fs_page == failed_page) - return; fs_page = bh->b_page; } } @@ -1821,12 +1710,11 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); - nilfs_abort_logs(&logs, NULL, ret ? : err); + nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); - nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, @@ -1920,8 +1808,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) nilfs_end_page_io(fs_page, 0); - nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0); - nilfs_drop_collected_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) @@ -1979,7 +1865,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, "failed to get inode block.\n"); return err; } - nilfs_mdt_mark_buffer_dirty(ibh); + mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) @@ -1991,8 +1877,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); + list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); } spin_unlock(&nilfs->ns_inode_lock); @@ -2014,8 +1899,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, clear_bit(NILFS_I_BUSY, &ii->i_state); brelse(ii->i_bh); ii->i_bh = NULL; - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &ti->ti_garbage); + list_move_tail(&ii->i_dirty, &ti->ti_garbage); } spin_unlock(&nilfs->ns_inode_lock); } @@ -2026,7 +1910,6 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct page *failed_page; int err; sci->sc_stage.scnt = NILFS_ST_INIT; @@ -2081,11 +1964,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); /* Write partial segments */ - err = nilfs_segctor_prepare_write(sci, &failed_page); - if (err) { - nilfs_abort_logs(&sci->sc_segbufs, failed_page, err); - goto failed_to_write; - } + nilfs_segctor_prepare_write(sci); nilfs_add_checksums_on_logs(&sci->sc_segbufs, nilfs->ns_crc_seed); @@ -2687,7 +2566,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); - INIT_LIST_HEAD(&sci->sc_copied_buffers); init_timer(&sci->sc_timer); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; @@ -2741,8 +2619,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); - WARN_ON(!list_empty(&sci->sc_copied_buffers)); - if (!list_empty(&sci->sc_dirty_files)) { nilfs_warning(sci->sc_super, __func__, "dirty file(s) after the final construction\n"); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6c02a86745f..38a1d001331 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -92,7 +92,6 @@ struct nilfs_segsum_pointer { * @sc_nblk_inc: Block count of current generation * @sc_dirty_files: List of files to be written * @sc_gc_inodes: List of GC inodes having blocks to be written - * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data * @sc_freesegs: array of segment numbers to be freed * @sc_nfreesegs: number of segments on @sc_freesegs * @sc_dsync_inode: inode whose data pages are written for a sync operation @@ -136,7 +135,6 @@ struct nilfs_sc_info { struct list_head sc_dirty_files; struct list_head sc_gc_inodes; - struct list_head sc_copied_buffers; __u64 *sc_freesegs; size_t sc_nfreesegs; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1d6f488ccae..0a0aba617d8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -33,7 +33,9 @@ struct nilfs_sufile_info { struct nilfs_mdt_info mi; - unsigned long ncleansegs; + unsigned long ncleansegs;/* number of clean segments */ + __u64 allocmin; /* lower limit of allocatable segment range */ + __u64 allocmax; /* upper limit of allocatable segment range */ }; static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) @@ -96,6 +98,13 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, create, NULL, bhp); } +static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, + __u64 segnum) +{ + return nilfs_mdt_delete_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum)); +} + static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { @@ -108,7 +117,7 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); } /** @@ -248,6 +257,35 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, } /** + * nilfs_sufile_set_alloc_range - limit range of segment to be allocated + * @sufile: inode of segment usage file + * @start: minimum segment number of allocatable region (inclusive) + * @end: maximum segment number of allocatable region (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-ERANGE - invalid segment region + */ +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + __u64 nsegs; + int ret = -ERANGE; + + down_write(&NILFS_MDT(sufile)->mi_sem); + nsegs = nilfs_sufile_get_nsegments(sufile); + + if (start <= end && end < nsegs) { + sui->allocmin = start; + sui->allocmax = end; + ret = 0; + } + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file * @segnump: pointer to segment number @@ -269,11 +307,12 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) struct buffer_head *header_bh, *su_bh; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; void *kaddr; - unsigned long nsegments, ncleansegs, nsus; - int ret, i, j; + unsigned long nsegments, ncleansegs, nsus, cnt; + int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -287,13 +326,31 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) kunmap_atomic(kaddr, KM_USER0); nsegments = nilfs_sufile_get_nsegments(sufile); + maxsegnum = sui->allocmax; segnum = last_alloc + 1; - maxsegnum = nsegments - 1; - for (i = 0; i < nsegments; i += nsus) { - if (segnum >= nsegments) { - /* wrap around */ - segnum = 0; - maxsegnum = last_alloc; + if (segnum < sui->allocmin || segnum > sui->allocmax) + segnum = sui->allocmin; + + for (cnt = 0; cnt < nsegments; cnt += nsus) { + if (segnum > maxsegnum) { + if (cnt < sui->allocmax - sui->allocmin + 1) { + /* + * wrap around in the limited region. + * if allocation started from + * sui->allocmin, this never happens. + */ + segnum = sui->allocmin; + maxsegnum = last_alloc; + } else if (segnum > sui->allocmin && + sui->allocmax + 1 < nsegments) { + segnum = sui->allocmax + 1; + maxsegnum = nsegments - 1; + } else if (sui->allocmin > 0) { + segnum = 0; + maxsegnum = sui->allocmin - 1; + } else { + break; /* never happens */ + } } ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); @@ -319,9 +376,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) header->sh_last_alloc = cpu_to_le64(segnum); kunmap_atomic(kaddr, KM_USER0); - NILFS_SUI(sufile)->ncleansegs--; - nilfs_mdt_mark_buffer_dirty(header_bh); - nilfs_mdt_mark_buffer_dirty(su_bh); + sui->ncleansegs--; + mark_buffer_dirty(header_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; @@ -364,7 +421,7 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -395,7 +452,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -421,7 +478,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, sudirty = nilfs_segment_usage_dirty(su); nilfs_segment_usage_set_clean(su); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); NILFS_SUI(sufile)->ncleansegs++; @@ -441,7 +498,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { - nilfs_mdt_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); } @@ -476,7 +533,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_nblocks = cpu_to_le32(nblocks); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -505,7 +562,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; - struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; void *kaddr; int ret; @@ -555,11 +612,183 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, -1, 0); NILFS_SUI(sufile)->ncleansegs--; } - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } /** + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ +static int nilfs_sufile_truncate_range(struct inode *sufile, + __u64 start, __u64 end) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su, *su2; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + unsigned long segusages_per_block; + unsigned long nsegs, ncleaned; + __u64 segnum; + void *kaddr; + ssize_t n, nc; + int ret; + int j; + + nsegs = nilfs_sufile_get_nsegments(sufile); + + ret = -EINVAL; + if (start > end || start >= nsegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + ncleaned = 0; + + for (segnum = start; segnum <= end; segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + end - segnum + 1); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_header; + /* hole */ + continue; + } + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + su2 = su; + for (j = 0; j < n; j++, su = (void *)su + susz) { + if ((le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + nilfs_segment_is_active(nilfs, segnum + j)) { + ret = -EBUSY; + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + } + nc = 0; + for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { + if (nilfs_segment_usage_error(su)) { + nilfs_segment_usage_set_clean(su); + nc++; + } + } + kunmap_atomic(kaddr, KM_USER0); + if (nc > 0) { + mark_buffer_dirty(su_bh); + ncleaned += nc; + } + brelse(su_bh); + + if (n == segusages_per_block) { + /* make hole */ + nilfs_sufile_delete_segment_usage_block(sufile, segnum); + } + } + ret = 0; + +out_header: + if (ncleaned > 0) { + NILFS_SUI(sufile)->ncleansegs += ncleaned; + nilfs_sufile_mod_counter(header_bh, ncleaned, 0); + nilfs_mdt_mark_dirty(sufile); + } + brelse(header_bh); +out: + return ret; +} + +/** + * nilfs_sufile_resize - resize segment array + * @sufile: inode of segment usage file + * @newnsegs: new number of segments + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - Enough free space is not left for shrinking + * + * %-EBUSY - Dirty or active segments exist in the region to be truncated + */ +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + void *kaddr; + unsigned long nsegs, nrsvsegs; + int ret = 0; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nsegs = nilfs_sufile_get_nsegments(sufile); + if (nsegs == newnsegs) + goto out; + + ret = -ENOSPC; + nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); + if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + if (newnsegs > nsegs) { + sui->ncleansegs += newnsegs - nsegs; + } else /* newnsegs < nsegs */ { + ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); + if (ret < 0) + goto out_header; + + sui->ncleansegs -= nsegs - newnsegs; + } + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + nilfs_set_nsegments(nilfs, newnsegs); + +out_header: + brelse(header_bh); +out: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** * nilfs_sufile_get_suinfo - * @sufile: inode of segment usage file * @segnum: segment number to start looking @@ -583,7 +812,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, struct nilfs_segment_usage *su; struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; - struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; @@ -679,6 +908,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, kunmap_atomic(kaddr, KM_USER0); brelse(header_bh); + sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; + sui->allocmin = 0; + unlock_new_inode(sufile); out: *inodep = sufile; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a943fbacb45..e84bc5b51fc 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -31,11 +31,12 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { - return NILFS_I_NILFS(sufile)->ns_nsegments; + return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments; } unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, @@ -61,6 +62,7 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 062cca06519..8351c44a732 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -56,6 +56,7 @@ #include "btnode.h" #include "page.h" #include "cpfile.h" +#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ #include "ifile.h" #include "dat.h" #include "segment.h" @@ -165,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); return &ii->vfs_inode; } @@ -347,6 +348,134 @@ int nilfs_cleanup_super(struct super_block *sb) return ret; } +/** + * nilfs_move_2nd_super - relocate secondary super block + * @sb: super block instance + * @sb2off: new offset of the secondary super block (in bytes) + */ +static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *nsbh; + struct nilfs_super_block *nsbp; + sector_t blocknr, newblocknr; + unsigned long offset; + int sb2i = -1; /* array index of the secondary superblock */ + int ret = 0; + + /* nilfs->ns_sem must be locked by the caller. */ + if (nilfs->ns_sbh[1] && + nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 1; + blocknr = nilfs->ns_sbh[1]->b_blocknr; + } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 0; + blocknr = nilfs->ns_sbh[0]->b_blocknr; + } + if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) + goto out; /* super block location is unchanged */ + + /* Get new super block buffer */ + newblocknr = sb2off >> nilfs->ns_blocksize_bits; + offset = sb2off & (nilfs->ns_blocksize - 1); + nsbh = sb_getblk(sb, newblocknr); + if (!nsbh) { + printk(KERN_WARNING + "NILFS warning: unable to move secondary superblock " + "to block %llu\n", (unsigned long long)newblocknr); + ret = -EIO; + goto out; + } + nsbp = (void *)nsbh->b_data + offset; + memset(nsbp, 0, nilfs->ns_blocksize); + + if (sb2i >= 0) { + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + brelse(nilfs->ns_sbh[sb2i]); + nilfs->ns_sbh[sb2i] = nsbh; + nilfs->ns_sbp[sb2i] = nsbp; + } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { + /* secondary super block will be restored to index 1 */ + nilfs->ns_sbh[1] = nsbh; + nilfs->ns_sbp[1] = nsbp; + } else { + brelse(nsbh); + } +out: + return ret; +} + +/** + * nilfs_resize_fs - resize the filesystem + * @sb: super block instance + * @newsize: new size of the filesystem (in bytes) + */ +int nilfs_resize_fs(struct super_block *sb, __u64 newsize) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + __u64 devsize, newnsegs; + loff_t sb2off; + int ret; + + ret = -ERANGE; + devsize = i_size_read(sb->s_bdev->bd_inode); + if (newsize > devsize) + goto out; + + /* + * Write lock is required to protect some functions depending + * on the number of segments, the number of reserved segments, + * and so forth. + */ + down_write(&nilfs->ns_segctor_sem); + + sb2off = NILFS_SB2_OFFSET_BYTES(newsize); + newnsegs = sb2off >> nilfs->ns_blocksize_bits; + do_div(newnsegs, nilfs->ns_blocks_per_segment); + + ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); + up_write(&nilfs->ns_segctor_sem); + if (ret < 0) + goto out; + + ret = nilfs_construct_segment(sb); + if (ret < 0) + goto out; + + down_write(&nilfs->ns_sem); + nilfs_move_2nd_super(sb, sb2off); + ret = -EIO; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + /* + * Drop NILFS_RESIZE_FS flag for compatibility with + * mount-time resize which may be implemented in a + * future release. + */ + sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & + ~NILFS_RESIZE_FS); + sbp[0]->s_dev_size = cpu_to_le64(newsize); + sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + up_write(&nilfs->ns_sem); + + /* + * Reset the range of allocatable segments last. This order + * is important in the case of expansion because the secondary + * superblock must be protected from log write until migration + * completes. + */ + if (!ret) + nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); +out: + return ret; +} + static void nilfs_put_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d2acd1a651f..d3271409437 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -363,6 +363,24 @@ static unsigned long long nilfs_max_size(unsigned int blkbits) return res; } +/** + * nilfs_nrsvsegs - calculate the number of reserved segments + * @nilfs: nilfs object + * @nsegs: total number of segments + */ +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) +{ + return max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, + 100)); +} + +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) +{ + nilfs->ns_nsegments = nsegs; + nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); +} + static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { @@ -389,13 +407,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, } nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); - nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); - nilfs->ns_nrsvsegs = - max_t(unsigned long, NILFS_MIN_NRSVSEGS, - DIV_ROUND_UP(nilfs->ns_nsegments * - nilfs->ns_r_segments_percentage, 100)); + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index f4968145c2a..9992b11312f 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -268,6 +268,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev); void destroy_nilfs(struct the_nilfs *nilfs); int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index d8a0313e99e..f17e58b3298 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -30,6 +30,7 @@ ocfs2-objs := \ namei.o \ refcounttree.o \ reservations.o \ + move_extents.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 48aa9c7401c..ed553c60de8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -29,6 +29,7 @@ #include <linux/highmem.h> #include <linux/swap.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include <cluster/masklog.h> @@ -7184,3 +7185,168 @@ out_commit: out: return ret; } + +static int ocfs2_trim_extent(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 count) +{ + u64 discard, bcount; + + bcount = ocfs2_clusters_to_blocks(sb, count); + discard = le64_to_cpu(gd->bg_blkno) + + ocfs2_clusters_to_blocks(sb, start); + + trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); + + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); +} + +static int ocfs2_trim_group(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 max, u32 minbits) +{ + int ret = 0, count = 0, next; + void *bitmap = gd->bg_bitmap; + + if (le16_to_cpu(gd->bg_free_bits_count) < minbits) + return 0; + + trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), + start, max, minbits); + + while (start < max) { + start = ocfs2_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = ocfs2_find_next_bit(bitmap, max, start); + + if ((next - start) >= minbits) { + ret = ocfs2_trim_extent(sb, gd, + start, next - start); + if (ret < 0) { + mlog_errno(ret); + break; + } + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) + break; + } + + if (ret < 0) + count = ret; + + return count; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 start, len, trimmed, first_group, last_group, group; + int ret, cnt; + u32 first_bit, last_bit, minlen; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_dinode *main_bm; + struct ocfs2_group_desc *gd = NULL; + + start = range->start >> osb->s_clustersize_bits; + len = range->len >> osb->s_clustersize_bits; + minlen = range->minlen >> osb->s_clustersize_bits; + trimmed = 0; + + if (!len) { + range->len = 0; + return 0; + } + + if (minlen >= osb->bitmap_cpg) + return -EINVAL; + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } + + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; + + trace_ocfs2_trim_fs(start, len, minlen); + + /* Determine first and last group to examine based on start and len */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); + last_bit = osb->bitmap_cpg; + + for (group = first_group; group <= last_group;) { + if (first_bit + len >= osb->bitmap_cpg) + last_bit = osb->bitmap_cpg; + else + last_bit = first_bit + len; + + ret = ocfs2_read_group_descriptor(main_bm_inode, + main_bm, group, + &gd_bh); + if (ret < 0) { + mlog_errno(ret); + break; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + brelse(gd_bh); + gd_bh = NULL; + if (cnt < 0) { + ret = cnt; + mlog_errno(ret); + break; + } + + trimmed += cnt; + len -= osb->bitmap_cpg - first_bit; + first_bit = 0; + if (group == osb->first_cluster_group_blkno) + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + else + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + } + range->len = trimmed * sb->s_blocksize; +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 0); + brelse(main_bm_bh); +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); +out: + return ret; +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 3bd08a03251..ca381c58412 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci, struct buffer_head **leaf_bh); int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); /* * Helper function to look at the # of clusters in an extent record. */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index bc702dab5d1..a4b07730b2e 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -57,7 +57,6 @@ static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); - sysfs_remove_link(NULL, "o2cb"); kset_unregister(o2cb_kset); } @@ -69,14 +68,6 @@ int o2cb_sys_init(void) if (!o2cb_kset) return -ENOMEM; - /* - * Create this symlink for backwards compatibility with old - * versions of ocfs2-tools which look for things in /sys/o2cb. - */ - ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); - if (ret) - goto error; - ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) goto error; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 4bdf7baee34..d602abb51b6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -144,6 +144,7 @@ struct dlm_ctxt wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct dlm_recovery_ctxt reco; spinlock_t master_lock; @@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb) return 1; } +static inline char *dlm_list_in_text(enum dlm_lockres_list idx) +{ + if (idx == DLM_GRANTED_LIST) + return "granted"; + else if (idx == DLM_CONVERTING_LIST) + return "converting"; + else if (idx == DLM_BLOCKED_LIST) + return "blocked"; + else + return "unknown"; +} + static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -448,6 +461,7 @@ enum { DLM_FINALIZE_RECO_MSG = 518, DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, + DLM_BEGIN_EXIT_DOMAIN_MSG = 521, }; struct dlm_reco_node_data diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 04a32be0aeb..56f82cb912e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) buf + out, len - out); out += snprintf(buf + out, len - out, "\n"); + /* Exit Domain Map: xx xx xx */ + out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, + buf + out, len - out); + out += snprintf(buf + out, len - out, "\n"); + /* Live Map: xx xx xx */ out += snprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 3b179d6cbde..6ed6b95dcf9 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * New in version 1.1: * - Message DLM_QUERY_REGION added to support global heartbeat * - Message DLM_QUERY_NODEINFO added to allow online node removes + * New in version 1.2: + * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 1, + .pv_minor = 2, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -449,14 +451,18 @@ redo_bucket: dropped = dlm_empty_lockres(dlm, res); spin_lock(&res->spinlock); - __dlm_lockres_calc_usage(dlm, res); - iter = res->hash_node.next; + if (dropped) + __dlm_lockres_calc_usage(dlm, res); + else + iter = res->hash_node.next; spin_unlock(&res->spinlock); dlm_lockres_put(res); - if (dropped) + if (dropped) { + cond_resched_lock(&dlm->spinlock); goto redo_bucket; + } } cond_resched_lock(&dlm->spinlock); num += n; @@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm) return ret; } +static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len, + void *data, void **ret_data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node); + + spin_lock(&dlm->spinlock); + set_bit(node, dlm->exit_domain_map); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) { /* Yikes, a double spinlock! I need domain_lock for the dlm @@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); + clear_bit(node, dlm->exit_domain_map); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, return 0; } -static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type, unsigned int node) { int status; struct dlm_exit_domain leave_msg; - mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", - node, dlm->name, dlm->node_num); + mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name, + msg_type, node); memset(&leave_msg, 0, sizeof(leave_msg)); leave_msg.node_idx = dlm->node_num; - status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, - &leave_msg, sizeof(leave_msg), node, - NULL); + status = o2net_send_message(msg_type, dlm->key, &leave_msg, + sizeof(leave_msg), node, NULL); if (status < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); - mlog(0, "status return %d from o2net_send_message\n", status); + mlog(ML_ERROR, "Error %d sending domain exit message %u " + "to node %u on domain %s\n", status, msg_type, node, + dlm->name); return status; } +static void dlm_begin_exit_domain(struct dlm_ctxt *dlm) +{ + int node = -1; + + /* Support for begin exit domain was added in 1.2 */ + if (dlm->dlm_locking_proto.pv_major == 1 && + dlm->dlm_locking_proto.pv_minor < 2) + return; + + /* + * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely + * informational. Meaning if a node does not receive the message, + * so be it. + */ + spin_lock(&dlm->spinlock); + while (1) { + node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1); + if (node >= O2NM_MAX_NODES) + break; + if (node == dlm->node_num) + continue; + + spin_unlock(&dlm->spinlock); + dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node); + spin_lock(&dlm->spinlock); + } + spin_unlock(&dlm->spinlock); +} static void dlm_leave_domain(struct dlm_ctxt *dlm) { @@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) clear_node = 1; - status = dlm_send_one_domain_exit(dlm, node); + status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG, + node); if (status < 0 && status != -ENOPROTOOPT && status != -ENOTCONN) { @@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) if (leave) { mlog(0, "shutting down domain %s\n", dlm->name); + dlm_begin_exit_domain(dlm); /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); @@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * leftover join state. */ BUG_ON(dlm->joining_node != assert->node_idx); set_bit(assert->node_idx, dlm->domain_map); + clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", @@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) if (status) goto bail; + status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_begin_exit_domain_handler, + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 84d166328cf..11eefb8c12e 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) dlm_lockres_put(res); } -/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 - * if not. If 0, numlocks is set to the number of locks in the lockres. +/* + * A migrateable resource is one that is : + * 1. locally mastered, and, + * 2. zero local locks, and, + * 3. one or more non-local locks, or, one or more references + * Returns 1 if yes, 0 if not. */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - int *numlocks, - int *hasrefs) + struct dlm_lock_resource *res) { - int ret; - int i; - int count = 0; + enum dlm_lockres_list idx; + int nonlocal = 0, node_ref; struct list_head *queue; struct dlm_lock *lock; + u64 cookie; assert_spin_locked(&res->spinlock); - *numlocks = 0; - *hasrefs = 0; - - ret = -EINVAL; - if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { - mlog(0, "cannot migrate lockres with unknown owner!\n"); - goto leave; - } - - if (res->owner != dlm->node_num) { - mlog(0, "cannot migrate lockres this node doesn't own!\n"); - goto leave; - } + if (res->owner != dlm->node_num) + return 0; - ret = 0; - queue = &res->granted; - for (i = 0; i < 3; i++) { + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); list_for_each_entry(lock, queue, list) { - ++count; - if (lock->ml.node == dlm->node_num) { - mlog(0, "found a lock owned by this node still " - "on the %s queue! will not migrate this " - "lockres\n", (i == 0 ? "granted" : - (i == 1 ? "converting" : - "blocked"))); - ret = -ENOTEMPTY; - goto leave; + if (lock->ml.node != dlm->node_num) { + nonlocal++; + continue; } + cookie = be64_to_cpu(lock->ml.cookie); + mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + "%s list\n", dlm->name, res->lockname.len, + res->lockname.name, + dlm_get_lock_cookie_node(cookie), + dlm_get_lock_cookie_seq(cookie), + dlm_list_in_text(idx)); + return 0; } - queue++; } - *numlocks = count; - - count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (count < O2NM_MAX_NODES) - *hasrefs = 1; + if (!nonlocal) { + node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (node_ref >= O2NM_MAX_NODES) + return 0; + } - mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, - res->lockname.len, res->lockname.name, *numlocks, *hasrefs); + mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + res->lockname.name); -leave: - return ret; + return 1; } /* @@ -2406,8 +2396,7 @@ leave: static int dlm_migrate_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 target) + struct dlm_lock_resource *res, u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; @@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; - int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) return -EINVAL; + BUG_ON(target == O2NM_MAX_NODES); + name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); - - /* - * ensure this lockres is a proper candidate for migration - */ - spin_lock(&res->spinlock); - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); - if (ret < 0) { - spin_unlock(&res->spinlock); - goto leave; - } - spin_unlock(&res->spinlock); - - /* no work to do */ - if (numlocks == 0 && !hasrefs) - goto leave; - - /* - * preallocate up front - * if this fails, abort - */ + mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, + target); + /* preallocate up front. if this fails, abort */ ret = -ENOMEM; mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { @@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, ret = 0; /* - * find a node to migrate the lockres to - */ - - spin_lock(&dlm->spinlock); - /* pick a new node */ - if (!test_bit(target, dlm->domain_map) || - target >= O2NM_MAX_NODES) { - target = dlm_pick_migration_target(dlm, res); - } - mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, - namelen, name, target); - - if (target >= O2NM_MAX_NODES || - !test_bit(target, dlm->domain_map)) { - /* target chosen is not alive */ - ret = -EINVAL; - } - - if (ret) { - spin_unlock(&dlm->spinlock); - goto fail; - } - - mlog(0, "continuing with target = %u\n", target); - - /* * clear any existing master requests and * add the migration mle to the list */ + spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); @@ -2531,6 +2478,7 @@ fail: dlm_put_mle(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); + mle = NULL; } goto leave; } @@ -2652,69 +2600,52 @@ leave: if (wake) wake_up(&res->wq); - /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); dlm_put(dlm); - mlog(0, "returning %d\n", ret); + mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, + name, target, ret); return ret; } #define DLM_MIGRATION_RETRY_MS 100 -/* Should be called only after beginning the domain leave process. +/* + * Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, * and there should be no local locks left on locally mastered resources. * * Called with the dlm spinlock held, may drop it to do migration, but * will re-acquire before exit. * - * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped + */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - int numlocks, hasrefs; + u8 target = O2NM_MAX_NODES; + + assert_spin_locked(&dlm->spinlock); spin_lock(&res->spinlock); - if (res->owner != dlm->node_num) { - if (!__dlm_lockres_unused(res)) { - mlog(ML_ERROR, "%s:%.*s: this node is not master, " - "trying to free this but locks remain\n", - dlm->name, res->lockname.len, res->lockname.name); - } - spin_unlock(&res->spinlock); - goto leave; - } + if (dlm_is_lockres_migrateable(dlm, res)) + target = dlm_pick_migration_target(dlm, res); + spin_unlock(&res->spinlock); - /* No need to migrate a lockres having no locks */ - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); - if (ret >= 0 && numlocks == 0 && !hasrefs) { - spin_unlock(&res->spinlock); + if (target == O2NM_MAX_NODES) goto leave; - } - spin_unlock(&res->spinlock); /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ spin_unlock(&dlm->spinlock); lock_dropped = 1; - while (1) { - ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); - if (ret >= 0) - break; - if (ret == -ENOTEMPTY) { - mlog(ML_ERROR, "lockres %.*s still has local locks!\n", - res->lockname.len, res->lockname.name); - BUG(); - } - - mlog(0, "lockres %.*s: migrate failed, " - "retrying\n", res->lockname.len, - res->lockname.name); - msleep(DLM_MIGRATION_RETRY_MS); - } + ret = dlm_migrate_lockres(dlm, res, target); + if (ret) + mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", + dlm->name, res->lockname.len, res->lockname.name, + target, ret); spin_lock(&dlm->spinlock); leave: return lock_dropped; @@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, } } -/* for now this is not too intelligent. we will - * need stats to make this do the right thing. - * this just finds the first lock on one of the - * queues and uses that node as the target. */ +/* + * Pick a node to migrate the lock resource to. This function selects a + * potential target based first on the locks and then on refmap. It skips + * nodes that are in the process of exiting the domain. + */ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - int i; + enum dlm_lockres_list idx; struct list_head *queue = &res->granted; struct dlm_lock *lock; - int nodenum; + int noderef; + u8 nodenum = O2NM_MAX_NODES; assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); - spin_lock(&res->spinlock); - for (i=0; i<3; i++) { + /* Go through all the locks */ + for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { + queue = dlm_list_idx_to_ptr(res, idx); list_for_each_entry(lock, queue, list) { - /* up to the caller to make sure this node - * is alive */ - if (lock->ml.node != dlm->node_num) { - spin_unlock(&res->spinlock); - return lock->ml.node; - } + if (lock->ml.node == dlm->node_num) + continue; + if (test_bit(lock->ml.node, dlm->exit_domain_map)) + continue; + nodenum = lock->ml.node; + goto bail; } - queue++; - } - - nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (nodenum < O2NM_MAX_NODES) { - spin_unlock(&res->spinlock); - return nodenum; } - spin_unlock(&res->spinlock); - mlog(0, "have not found a suitable target yet! checking domain map\n"); - /* ok now we're getting desperate. pick anyone alive. */ - nodenum = -1; + /* Go thru the refmap */ + noderef = -1; while (1) { - nodenum = find_next_bit(dlm->domain_map, - O2NM_MAX_NODES, nodenum+1); - mlog(0, "found %d in domain map\n", nodenum); - if (nodenum >= O2NM_MAX_NODES) + noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, + noderef + 1); + if (noderef >= O2NM_MAX_NODES) break; - if (nodenum != dlm->node_num) { - mlog(0, "picking %d\n", nodenum); - return nodenum; - } + if (noderef == dlm->node_num) + continue; + if (test_bit(noderef, dlm->exit_domain_map)) + continue; + nodenum = noderef; + goto bail; } - mlog(0, "giving up. no master to migrate to\n"); - return DLM_LOCK_RES_OWNER_UNKNOWN; +bail: + return nodenum; } - - /* this is called by the new master once all lockres * data has been received */ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f1beb6fc254..7efab6d28a2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) mlog(0, "node %u being removed from domain map!\n", idx); clear_bit(idx, dlm->domain_map); + clear_bit(idx, dlm->exit_domain_map); /* wake up migration waiters if a node goes down. * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8c5c0eddc36..b4207679704 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker; * signifies a bast fired on the lock. */ #define DLMFS_CAPABILITIES "bast stackglue" -extern int param_set_dlmfs_capabilities(const char *val, +static int param_set_dlmfs_capabilities(const char *val, struct kernel_param *kp) { printk(KERN_ERR "%s: readonly parameter\n", kp->name); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 89659d6dc20..b1e35a392ca 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 8f13c5989ea..bc91072b721 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -22,6 +22,11 @@ #include "ioctl.h" #include "resize.h" #include "refcounttree.h" +#include "sysfile.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "suballoc.h" +#include "move_extents.h" #include <linux/ext2_fs.h> @@ -35,31 +40,27 @@ * be -EFAULT. The error will be returned from the ioctl(2) call. It's * just a best-effort to tell userspace that this request caused the error. */ -static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, +static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } -#define o2info_set_request_error(a, b) \ - __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) - -static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) +static inline void o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } -#define o2info_set_request_filled(a) \ - __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) - -static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) +static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } -#define o2info_clear_request_filled(a) \ - __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) +static inline int o2info_coherent(struct ocfs2_info_request *req) +{ + return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); +} static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { @@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, oib.ib_blocksize = inode->i_sb->s_blocksize; - o2info_set_request_filled(oib); + o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) goto bail; @@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oib, req); + o2info_set_request_error(&oib.ib_req, req); return status; } @@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, oic.ic_clustersize = osb->s_clustersize; - o2info_set_request_filled(oic); + o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) goto bail; @@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oic, req); + o2info_set_request_error(&oic.ic_req, req); return status; } @@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, oim.im_max_slots = osb->max_slots; - o2info_set_request_filled(oim); + o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) goto bail; @@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oim, req); + o2info_set_request_error(&oim.im_req, req); return status; } @@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode, memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); - o2info_set_request_filled(oil); + o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) goto bail; @@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oil, req); + o2info_set_request_error(&oil.il_req, req); return status; } @@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); - o2info_set_request_filled(oiu); + o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) goto bail; @@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oiu, req); + o2info_set_request_error(&oiu.iu_req, req); return status; } @@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; - o2info_set_request_filled(oif); + o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) goto bail; @@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oif, req); + o2info_set_request_error(&oif.if_req, req); return status; } @@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, oij.ij_journal_size = osb->journal->j_inode->i_size; - o2info_set_request_filled(oij); + o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) goto bail; @@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oij, req); + o2info_set_request_error(&oij.ij_req, req); + + return status; +} + +int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, u32 slot) +{ + int status = 0, unlock = 0; + + struct buffer_head *bh = NULL; + struct ocfs2_dinode *dinode_alloc = NULL; + + if (inode_alloc) + mutex_lock(&inode_alloc->i_mutex); + + if (o2info_coherent(&fi->ifi_req)) { + status = ocfs2_inode_lock(inode_alloc, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + dinode_alloc = (struct ocfs2_dinode *)bh->b_data; + + fi->ifi_stat[slot].lfi_total = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); + fi->ifi_stat[slot].lfi_free = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - + le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); + +bail: + if (unlock) + ocfs2_inode_unlock(inode_alloc, 0); + + if (inode_alloc) + mutex_unlock(&inode_alloc->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u32 i; + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + struct ocfs2_info_freeinode *oifi = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *inode_alloc = NULL; + + oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); + if (!oifi) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oifi, req)) + goto bail; + + oifi->ifi_slotnum = osb->max_slots; + + for (i = 0; i < oifi->ifi_slotnum; i++) { + if (o2info_coherent(&oifi->ifi_req)) { + inode_alloc = ocfs2_get_system_file_inode(osb, type, i); + if (!inode_alloc) { + mlog(ML_ERROR, "unable to get alloc inode in " + "slot %u\n", i); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); + if (status < 0) + goto bail; + + iput(inode_alloc); + inode_alloc = NULL; + } + + o2info_set_request_filled(&oifi->ifi_req); + + if (o2info_to_user(*oifi, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oifi->ifi_req, req); + + kfree(oifi); + + return status; +} + +static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, + unsigned int chunksize) +{ + int index; + + index = __ilog2_u32(chunksize); + if (index >= OCFS2_INFO_MAX_HIST) + index = OCFS2_INFO_MAX_HIST - 1; + + hist->fc_chunks[index]++; + hist->fc_clusters[index] += chunksize; +} + +static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, + unsigned int chunksize) +{ + if (chunksize > stats->ffs_max) + stats->ffs_max = chunksize; + + if (chunksize < stats->ffs_min) + stats->ffs_min = chunksize; + + stats->ffs_avg += chunksize; + stats->ffs_free_chunks_real++; +} + +void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) +{ + o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); + o2ffg_update_stats(&(ffg->iff_ffs), chunksize); +} + +int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) +{ + int status = 0, used; + u64 blkno; + + struct buffer_head *bh = NULL; + struct ocfs2_group_desc *bg = NULL; + + unsigned int max_bits, num_clusters; + unsigned int offset = 0, cluster, chunk; + unsigned int chunk_free, last_chunksize = 0; + + if (!le32_to_cpu(rec->c_free)) + goto bail; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (bh) { + brelse(bh); + bh = NULL; + } + + if (o2info_coherent(&ffg->iff_req)) + status = ocfs2_read_group_descriptor(gb_inode, + gb_dinode, + blkno, &bh); + else + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + + if (status < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # " + "%llu from device.", (unsigned long long)blkno); + status = -EIO; + goto bail; + } + + bg = (struct ocfs2_group_desc *)bh->b_data; + + if (!le16_to_cpu(bg->bg_free_bits_count)) + continue; + + max_bits = le16_to_cpu(bg->bg_bits); + offset = 0; + + for (chunk = 0; chunk < chunks_in_group; chunk++) { + /* + * last chunk may be not an entire one. + */ + if ((offset + ffg->iff_chunksize) > max_bits) + num_clusters = max_bits - offset; + else + num_clusters = ffg->iff_chunksize; + + chunk_free = 0; + for (cluster = 0; cluster < num_clusters; cluster++) { + used = ocfs2_test_bit(offset, + (unsigned long *)bg->bg_bitmap); + /* + * - chunk_free counts free clusters in #N chunk. + * - last_chunksize records the size(in) clusters + * for the last real free chunk being counted. + */ + if (!used) { + last_chunksize++; + chunk_free++; + } + + if (used && last_chunksize) { + ocfs2_info_update_ffg(ffg, + last_chunksize); + last_chunksize = 0; + } + + offset++; + } + + if (chunk_free == ffg->iff_chunksize) + ffg->iff_ffs.ffs_free_chunks++; + } + + /* + * need to update the info for last free chunk. + */ + if (last_chunksize) + ocfs2_info_update_ffg(ffg, last_chunksize); + + } while (le64_to_cpu(bg->bg_next_group)); + +bail: + brelse(bh); + + return status; +} + +int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) +{ + u32 chunks_in_group; + int status = 0, unlock = 0, i; + + struct buffer_head *bh = NULL; + struct ocfs2_chain_list *cl = NULL; + struct ocfs2_chain_rec *rec = NULL; + struct ocfs2_dinode *gb_dinode = NULL; + + if (gb_inode) + mutex_lock(&gb_inode->i_mutex); + + if (o2info_coherent(&ffg->iff_req)) { + status = ocfs2_inode_lock(gb_inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + gb_dinode = (struct ocfs2_dinode *)bh->b_data; + cl = &(gb_dinode->id2.i_chain); + + /* + * Chunksize(in) clusters from userspace should be + * less than clusters in a group. + */ + if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { + status = -EINVAL; + goto bail; + } + + memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); + + ffg->iff_ffs.ffs_min = ~0U; + ffg->iff_ffs.ffs_clusters = + le32_to_cpu(gb_dinode->id1.bitmap1.i_total); + ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - + le32_to_cpu(gb_dinode->id1.bitmap1.i_used); + + chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + rec = &(cl->cl_recs[i]); + status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, + gb_dinode, + rec, ffg, + chunks_in_group); + if (status) + goto bail; + } + + if (ffg->iff_ffs.ffs_free_chunks_real) + ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / + ffg->iff_ffs.ffs_free_chunks_real); +bail: + if (unlock) + ocfs2_inode_unlock(gb_inode, 0); + + if (gb_inode) + mutex_unlock(&gb_inode->i_mutex); + + if (gb_inode) + iput(gb_inode); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + + struct ocfs2_info_freefrag *oiff; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *gb_inode = NULL; + + oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); + if (!oiff) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oiff, req)) + goto bail; + /* + * chunksize from userspace should be power of 2. + */ + if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || + (!oiff->iff_chunksize)) { + status = -EINVAL; + goto bail; + } + + if (o2info_coherent(&oiff->iff_req)) { + gb_inode = ocfs2_get_system_file_inode(osb, type, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, + OCFS2_INVALID_SLOT); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); + if (status < 0) + goto bail; + + o2info_set_request_filled(&oiff->iff_req); + + if (o2info_to_user(*oiff, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiff->iff_req, req); + + kfree(oiff); return status; } @@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, if (o2info_from_user(oir, req)) goto bail; - o2info_clear_request_filled(oir); + o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) goto bail; @@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oir, req); + o2info_set_request_error(&oir, req); return status; } @@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode, if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; + case OCFS2_INFO_FREEINODE: + if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) + status = ocfs2_info_handle_freeinode(inode, req); + break; + case OCFS2_INFO_FREEFRAG: + if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) + status = ocfs2_info_handle_freefrag(inode, req); + break; default: status = ocfs2_info_handle_unknown(inode, req); break; @@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ocfs2_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case OCFS2_IOC_MOVE_EXT: + return ocfs2_ioctl_move_extents(filp, (void __user *)arg); default: return -ENOTTY; } @@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, (struct reflink_arguments *)arg, @@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); + case OCFS2_IOC_MOVE_EXT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c new file mode 100644 index 00000000000..cd9427023d2 --- /dev/null +++ b/fs/ocfs2/move_extents.c @@ -0,0 +1,1152 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.c + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/mount.h> +#include <linux/swap.h> + +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_ioctl.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "uptodate.h" +#include "super.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "sysfile.h" +#include "suballoc.h" +#include "refcounttree.h" +#include "move_extents.h" + +struct ocfs2_move_extents_context { + struct inode *inode; + struct file *file; + int auto_defrag; + int partial; + int credits; + u32 new_phys_cpos; + u32 clusters_moved; + u64 refcount_loc; + struct ocfs2_move_extents *range; + struct ocfs2_extent_tree et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; +}; + +static int __ocfs2_move_extent(handle_t *handle, + struct ocfs2_move_extents_context *context, + u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, + int ext_flags) +{ + int ret = 0, index; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec *rec, replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); + u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); + + ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + p_cpos, new_p_cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + new_p_cpos)); + + path = ocfs2_new_path_from_et(&context->et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + rec = &el->l_recs[index]; + + BUG_ON(ext_flags != rec->e_flags); + /* + * after moving/defraging to new location, the extent is not going + * to be refcounted anymore. + */ + replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + context->et.et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_split_extent(handle, &context->et, path, index, + &replace_rec, context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, context->et.et_root_bh); + + context->new_phys_cpos = new_p_cpos; + + /* + * need I to append truncate log for old clusters? + */ + if (old_blkno) { + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + old_blkno), + len, context->meta_ac, + &context->dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + old_blkno, len); + } + +out: + return ret; +} + +/* + * lock allocators, and reserving appropriate number of bits for + * meta blocks and data clusters. + * + * in some cases, we don't need to reserve clusters, just let data_ac + * be NULL. + */ +static int ocfs2_lock_allocators_move_extents(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_move, + u32 extents_to_split, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int extra_blocks, + int *credits) +{ + int ret, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, + clusters_to_move + 2); + + mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", + extra_blocks, clusters_to_move, *credits); +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +/* + * Using one journal handle to guarantee the data consistency in case + * crash happens anywhere. + * + * XXX: defrag can end up with finishing partial extent as requested, + * due to not enough contiguous clusters can be found in allocator. + */ +static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, partial = context->partial; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 new_phys_cpos, new_len; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + *len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, + &context->meta_ac, + &context->data_ac, + extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * should be using allocation reservation strategy there? + * + * if (context->data_ac) + * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + */ + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock_mutex; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_mutex; + } + + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, + &new_phys_cpos, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * allowing partial extent moving is kind of 'pros and cons', it makes + * whole defragmentation less likely to fail, on the contrary, the bad + * thing is it may make the fs even more fragmented after moving, let + * userspace make a good decision here. + */ + if (new_len != *len) { + mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); + if (!partial) { + context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; + ret = -ENOSPC; + goto out_commit; + } + } + + mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, + phys_cpos, new_phys_cpos); + + ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, + new_phys_cpos, ext_flags); + if (ret) + mlog_errno(ret); + + if (partial && (new_len != *len)) + *len = new_len; + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock_mutex: + mutex_unlock(&tl_inode->i_mutex); + + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * find the victim alloc group, where #blkno fits. + */ +static int ocfs2_find_victim_alloc_group(struct inode *inode, + u64 vict_blkno, + int type, int slot, + int *vict_bit, + struct buffer_head **ret_bh) +{ + int ret, i, bits_per_unit = 0; + u64 blkno; + char namebuf[40]; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ac_bh = NULL, *gd_bh = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *rec; + struct ocfs2_dinode *ac_dinode; + struct ocfs2_group_desc *bg; + + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; + cl = &(ac_dinode->id2.i_chain); + rec = &(cl->cl_recs[0]); + + if (type == GLOBAL_BITMAP_SYSTEM_INODE) + bits_per_unit = osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits; + /* + * 'vict_blkno' was out of the valid range. + */ + if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || + (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + bits_per_unit))) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + + rec = &(cl->cl_recs[i]); + if (!rec) + continue; + + bg = NULL; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (gd_bh) { + brelse(gd_bh); + gd_bh = NULL; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + + le16_to_cpu(bg->bg_bits))) { + + *ret_bh = gd_bh; + *vict_bit = (vict_blkno - blkno) >> + bits_per_unit; + mlog(0, "find the victim group: #%llu, " + "total_bits: %u, vict_bit: %u\n", + blkno, le16_to_cpu(bg->bg_bits), + *vict_bit); + goto out; + } + + } while (le64_to_cpu(bg->bg_next_group)); + } + + ret = -EINVAL; +out: + brelse(ac_bh); + + /* + * caller has to release the gd_bh properly. + */ + return ret; +} + +/* + * XXX: helper to validate and adjust moving goal. + */ +static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, + struct ocfs2_move_extents *range) +{ + int ret, goal_bit = 0; + + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *bg = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int c_to_b = 1 << (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + + /* + * make goal become cluster aligned. + */ + range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, + range->me_goal); + /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* + * validate goal sits within global_bitmap, and return the victim + * group desc + */ + ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) + goto out; + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + /* + * movement is not gonna cross two groups. + */ + if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < + range->me_len) { + ret = -EINVAL; + goto out; + } + /* + * more exact validations/adjustments will be performed later during + * moving operation for each extent range. + */ + mlog(0, "extents get ready to be moved to #%llu block\n", + range->me_goal); + +out: + brelse(gd_bh); + + return ret; +} + +static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, + int *goal_bit, u32 move_len, u32 max_hop, + u32 *phys_cpos) +{ + int i, used, last_free_bits = 0, base_bit = *goal_bit; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + + for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { + + used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); + if (used) { + /* + * we even tried searching the free chunk by jumping + * a 'max_hop' distance, but still failed. + */ + if ((i - base_bit) > max_hop) { + *phys_cpos = 0; + break; + } + + if (last_free_bits) + last_free_bits = 0; + + continue; + } else + last_free_bits++; + + if (last_free_bits == move_len) { + *goal_bit = i; + *phys_cpos = base_cpos + i; + break; + } + } + + mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); +} + +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = + (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + ocfs2_journal_dirty(handle, di_bh); + +out: + return ret; +} + +static inline int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } + while (num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + ocfs2_journal_dirty(handle, group_bh); + +bail: + return status; +} + +static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, + u32 len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, goal_bit = 0; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *gb_inode = NULL; + struct buffer_head *gb_bh = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *gd; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, + context->range->me_threshold); + u64 phys_blkno, new_phys_blkno; + + phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, + &context->meta_ac, + NULL, extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * need to count 2 extra credits for global_bitmap inode and + * group descriptor. + */ + credits += OCFS2_INODE_UPDATE_CREDITS + 1; + + /* + * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() + * logic, while we still need to lock the global_bitmap. + */ + gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + ret = -EIO; + goto out; + } + + mutex_lock(&gb_inode->i_mutex); + + ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock_gb_mutex; + } + + mutex_lock(&tl_inode->i_mutex); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_tl_inode; + } + + new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); + ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * probe the victim cluster group to find a proper + * region to fit wanted movement, it even will perfrom + * a best-effort attempt by compromising to a threshold + * around the goal. + */ + ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, + new_phys_cpos); + if (!new_phys_cpos) { + ret = -ENOSPC; + goto out_commit; + } + + ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, + *new_phys_cpos, ext_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, + le16_to_cpu(gd->bg_chain)); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, + goal_bit, len); + if (ret) + mlog_errno(ret); + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + brelse(gd_bh); + +out_unlock_tl_inode: + mutex_unlock(&tl_inode->i_mutex); + + ocfs2_inode_unlock(gb_inode, 1); +out_unlock_gb_mutex: + mutex_unlock(&gb_inode->i_mutex); + brelse(gb_bh); + iput(gb_inode); + +out: + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * Helper to calculate the defraging length in one run according to threshold. + */ +static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, + u32 threshold, int *skip) +{ + if ((*alloc_size + *len_defraged) < threshold) { + /* + * proceed defragmentation until we meet the thresh + */ + *len_defraged += *alloc_size; + } else if (*len_defraged == 0) { + /* + * XXX: skip a large extent. + */ + *skip = 1; + } else { + /* + * split this extent to coalesce with former pieces as + * to reach the threshold. + * + * we're done here with one cycle of defragmentation + * in a size of 'thresh', resetting 'len_defraged' + * forces a new defragmentation. + */ + *alloc_size = threshold - *len_defraged; + *len_defraged = 0; + } +} + +static int __ocfs2_move_extents_range(struct buffer_head *di_bh, + struct ocfs2_move_extents_context *context) +{ + int ret = 0, flags, do_defrag, skip = 0; + u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; + u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; + + struct inode *inode = context->inode; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_move_extents *range = context->range; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if ((inode->i_size == 0) || (range->me_len == 0)) + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + context->refcount_loc = le64_to_cpu(di->i_refcount_loc); + + ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&context->dealloc); + + /* + * TO-DO XXX: + * + * - xattr extents. + */ + + do_defrag = context->auto_defrag; + + /* + * extents moving happens in unit of clusters, for the sake + * of simplicity, we may ignore two clusters where 'byte_start' + * and 'byte_start + len' were within. + */ + move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); + len_to_move = (range->me_start + range->me_len) >> + osb->s_clustersize_bits; + if (len_to_move >= move_start) + len_to_move -= move_start; + else + len_to_move = 0; + + if (do_defrag) { + defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; + if (defrag_thresh <= 1) + goto done; + } else + new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + range->me_goal); + + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " + "thresh: %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range->me_start, + (unsigned long long)range->me_len, + move_start, len_to_move, defrag_thresh); + + cpos = move_start; + while (len_to_move) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, + &flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > len_to_move) + alloc_size = len_to_move; + + /* + * XXX: how to deal with a hole: + * + * - skip the hole of course + * - force a new defragmentation + */ + if (!phys_cpos) { + if (do_defrag) + len_defraged = 0; + + goto next; + } + + if (do_defrag) { + ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, + defrag_thresh, &skip); + /* + * skip large extents + */ + if (skip) { + skip = 0; + goto next; + } + + mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " + "alloc_size: %u, len_defraged: %u\n", + cpos, phys_cpos, alloc_size, len_defraged); + + ret = ocfs2_defrag_extent(context, cpos, phys_cpos, + &alloc_size, flags); + } else { + ret = ocfs2_move_extent(context, cpos, phys_cpos, + &new_phys_cpos, alloc_size, + flags); + + new_phys_cpos += alloc_size; + } + + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + context->clusters_moved += alloc_size; +next: + cpos += alloc_size; + len_to_move -= alloc_size; + } + +done: + range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; + +out: + range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, + context->clusters_moved); + range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, + context->new_phys_cpos); + + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + + return ret; +} + +static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) +{ + int status; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_dinode *di; + struct buffer_head *di_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!inode) + return -ENOENT; + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + mutex_lock(&inode->i_mutex); + + /* + * This prevents concurrent writes from other nodes + */ + status = ocfs2_rw_lock(inode, 1); + if (status) { + mlog_errno(status); + goto out; + } + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status) { + mlog_errno(status); + goto out_rw_unlock; + } + + /* + * rememer ip_xattr_sem also needs to be held if necessary + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + status = __ocfs2_move_extents_range(di_bh, context); + + up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (status) { + mlog_errno(status); + goto out_inode_unlock; + } + + /* + * We update ctime for these changes + */ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_inode_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_inode_unlock: + brelse(di_bh); + ocfs2_inode_unlock(inode, 1); +out_rw_unlock: + ocfs2_rw_unlock(inode, 1); +out: + mutex_unlock(&inode->i_mutex); + + return status; +} + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) +{ + int status; + + struct inode *inode = filp->f_path.dentry->d_inode; + struct ocfs2_move_extents range; + struct ocfs2_move_extents_context *context = NULL; + + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + + if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) + goto out; + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + status = -EPERM; + goto out; + } + + context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); + if (!context) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + + context->inode = inode; + context->file = filp; + + if (argp) { + if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, + sizeof(range))) { + status = -EFAULT; + goto out; + } + } else { + status = -EINVAL; + goto out; + } + + if (range.me_start > i_size_read(inode)) + goto out; + + if (range.me_start + range.me_len > i_size_read(inode)) + range.me_len = i_size_read(inode) - range.me_start; + + context->range = ⦥ + + if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { + context->auto_defrag = 1; + /* + * ok, the default theshold for the defragmentation + * is 1M, since our maximum clustersize was 1M also. + * any thought? + */ + if (!range.me_threshold) + range.me_threshold = 1024 * 1024; + + if (range.me_threshold > i_size_read(inode)) + range.me_threshold = i_size_read(inode); + + if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) + context->partial = 1; + } else { + /* + * first best-effort attempt to validate and adjust the goal + * (physical address in block), while it can't guarantee later + * operation can succeed all the time since global_bitmap may + * change a bit over time. + */ + + status = ocfs2_validate_and_adjust_move_goal(inode, &range); + if (status) + goto out; + } + + status = ocfs2_move_extents(context); + if (status) + mlog_errno(status); +out: + /* + * movement/defragmentation may end up being partially completed, + * that's the reason why we need to return userspace the finished + * length and new_offset even if failure happens somewhere. + */ + if (argp) { + if (copy_to_user((struct ocfs2_move_extents *)argp, &range, + sizeof(range))) + status = -EFAULT; + } + + kfree(context); + + mnt_drop_write(filp->f_path.mnt); + + return status; +} diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h new file mode 100644 index 00000000000..4e143e81144 --- /dev/null +++ b/fs/ocfs2/move_extents.h @@ -0,0 +1,22 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.h + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_MOVE_EXTENTS_H +#define OCFS2_MOVE_EXTENTS_H + +int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); + +#endif /* OCFS2_MOVE_EXTENTS_H */ diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index b46f39bf743..5b27ff1fa57 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -142,6 +142,38 @@ struct ocfs2_info_journal_size { __u64 ij_journal_size; }; +struct ocfs2_info_freeinode { + struct ocfs2_info_request ifi_req; + struct ocfs2_info_local_freeinode { + __u64 lfi_total; + __u64 lfi_free; + } ifi_stat[OCFS2_MAX_SLOTS]; + __u32 ifi_slotnum; /* out */ + __u32 ifi_pad; +}; + +#define OCFS2_INFO_MAX_HIST (32) + +struct ocfs2_info_freefrag { + struct ocfs2_info_request iff_req; + struct ocfs2_info_freefrag_stats { /* (out) */ + struct ocfs2_info_free_chunk_list { + __u32 fc_chunks[OCFS2_INFO_MAX_HIST]; + __u32 fc_clusters[OCFS2_INFO_MAX_HIST]; + } ffs_fc_hist; + __u32 ffs_clusters; + __u32 ffs_free_clusters; + __u32 ffs_free_chunks; + __u32 ffs_free_chunks_real; + __u32 ffs_min; /* Minimum free chunksize in clusters */ + __u32 ffs_max; + __u32 ffs_avg; + __u32 ffs_pad; + } iff_ffs; + __u32 iff_chunksize; /* chunksize in clusters(in) */ + __u32 iff_pad; +}; + /* Codes for ocfs2_info_request */ enum ocfs2_info_type { OCFS2_INFO_CLUSTERSIZE = 1, @@ -151,6 +183,8 @@ enum ocfs2_info_type { OCFS2_INFO_UUID, OCFS2_INFO_FS_FEATURES, OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_FREEINODE, + OCFS2_INFO_FREEFRAG, OCFS2_INFO_NUM_TYPES }; @@ -171,4 +205,38 @@ enum ocfs2_info_type { #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) +struct ocfs2_move_extents { +/* All values are in bytes */ + /* in */ + __u64 me_start; /* Virtual start in the file to move */ + __u64 me_len; /* Length of the extents to be moved */ + __u64 me_goal; /* Physical offset of the goal, + it's in block unit */ + __u64 me_threshold; /* Maximum distance from goal or threshold + for auto defragmentation */ + __u64 me_flags; /* Flags for the operation: + * - auto defragmentation. + * - refcount,xattr cases. + */ + /* out */ + __u64 me_moved_len; /* Moved/defraged length */ + __u64 me_new_offset; /* Resulting physical location */ + __u32 me_reserved[2]; /* Reserved for futhure */ +}; + +#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to + claim new clusters + as the goal place + for extents moving */ +#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent + moving, is to make + movement less likely + to fail, may make fs + even more fragmented */ +#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation + completely gets done. + */ + +#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents) + #endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a1dae5bb54a..3b481f49063 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc, __entry->blkno, __entry->bit) ); +TRACE_EVENT(ocfs2_trim_extent, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + TP_ARGS(sb, blk, count), + TP_STRUCT__entry( + __field(int, dev_major) + __field(int, dev_minor) + __field(unsigned long long, blk) + __field(__u64, count) + ), + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->blk = blk; + __entry->count = count; + ), + TP_printk("%d %d %llu %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); + /* End of trace events for fs/ocfs2/alloc.c. */ /* Trace events for fs/ocfs2/localalloc.c. */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5d32749c896..ebfd3825f12 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -66,7 +66,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct ocfs2_cow_context *context, + struct file *file, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) return 0; } -static int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct ocfs2_cow_context *context, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct ocfs2_caching_info *ci = context->data_et.et_ci; + struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); struct super_block *sb = ocfs2_metadata_cache_get_super(ci); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; unsigned int from, to, readahead_pages; loff_t offset, end, map_end; - struct address_space *mapping = context->inode->i_mapping; + struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); @@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, * We only duplicate pages until we reach the page contains i_size - 1. * So trim 'end' to i_size. */ - if (end > i_size_read(context->inode)) - end = i_size_read(context->inode); + if (end > i_size_read(inode)) + end = i_size_read(inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page) && context->file) { + if (PageReadahead(page)) { page_cache_async_readahead(mapping, - &context->file->f_ra, - context->file, + &file->f_ra, file, page, page_index, readahead_pages); } @@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(context->inode, - handle, from, to, + ocfs2_map_and_dirty_page(inode, handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3015,14 +3014,15 @@ unlock: return ret; } -static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct ocfs2_cow_context *context, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0; - struct super_block *sb = context->inode->i_sb; - struct ocfs2_caching_info *ci = context->data_et.et_ci; + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); @@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context, cpos, - old, new, len); + ret = context->cow_duplicate_clusters(handle, context->file, + cpos, old, new, len); if (ret) { mlog_errno(ret); goto out; @@ -3162,22 +3162,22 @@ out: return ret; } -static int ocfs2_cow_sync_writeback(struct super_block *sb, - struct ocfs2_cow_context *context, - u32 cpos, u32 num_clusters) +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters) { int ret = 0; loff_t offset, end, map_end; pgoff_t page_index; struct page *page; - if (ocfs2_should_order_data(context->inode)) + if (ocfs2_should_order_data(inode)) return 0; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); - ret = filemap_fdatawrite_range(context->inode->i_mapping, + ret = filemap_fdatawrite_range(inode->i_mapping, offset, end - 1); if (ret < 0) { mlog_errno(ret); @@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = find_or_create_page(context->inode->i_mapping, + page = find_or_create_page(inode->i_mapping, page_index, GFP_NOFS); BUG_ON(!page); @@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, + ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, orig_num_clusters); if (ret) mlog_errno(ret); @@ -3706,7 +3706,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, context->cow_start = cow_start; context->cow_len = cow_len; context->ref_tree = ref_tree; - context->ref_root_bh = ref_root_bh;; + context->ref_root_bh = ref_root_bh; context->cow_object = xv; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index c8ce46f7d8e..7754608c83a 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, struct buffer_head *ref_root_bh, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); +int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct file *file, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +int ocfs2_cow_sync_writeback(struct super_block *sb, + struct inode *inode, + u32 cpos, u32 num_clusters); int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_extent_tree *data_et, struct ocfs2_caching_info *ref_ci, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5a521c74885..cdbaf5e9730 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,6 +41,7 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> +#include <linux/cleancache.h> #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" @@ -1566,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); - if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) + if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) @@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } + cleancache_init_shared_fs((char *)&uuid_net_key, sb); bail: return status; diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index de4ff29f1e0..c368360c35a 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) - return -ENOTEMPTY; + + if (S_ISDIR(inode->i_mode)) { + dentry_unhash(dentry); + if (!omfs_dir_is_empty(inode)) + return -ENOTEMPTY; + } ret = omfs_delete_entry(dentry); if (ret) @@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, int err; if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + /* overwriting existing file/dir */ err = omfs_remove(new_dir, new_dentry); if (err) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97d99c..8ed4d343319 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%u\n", + queue_limit_discard_alignment(&disk->queue->limits, + p->start_sect)); } ssize_t part_stat_show(struct device *dev, @@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 19d6750d1d6..6296b403c67 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -310,6 +310,15 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Check the GUID Partition Table header size */ + if (le32_to_cpu((*gpt)->header_size) > + bdev_logical_block_size(state->bdev)) { + pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", + le32_to_cpu((*gpt)->header_size), + bdev_logical_block_size(state->bdev)); + goto fail; + } + /* Check the GUID Partition Table CRC */ origcrc = le32_to_cpu((*gpt)->header_crc32); (*gpt)->header_crc32 = 0; diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index a29d5ccf3d5..af9fdf04676 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -565,7 +565,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) data = read_part_sector(state, 0, §); if (!data) { - ldm_crit ("Disk read failed."); + ldm_info ("Disk read failed."); return false; } diff --git a/fs/proc/Makefile b/fs/proc/Makefile index df434c5f28f..c1c72933592 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -20,6 +20,7 @@ proc-y += stat.o proc-y += uptime.o proc-y += version.o proc-y += softirqs.o +proc-y += namespaces.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 5e4f776b091..9b45ee84fbc 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -131,7 +131,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * you can test for combinations of others with * simple bit tests. */ -static const char *task_state_array[] = { +static const char * const task_state_array[] = { "R (running)", /* 0 */ "S (sleeping)", /* 1 */ "D (disk sleep)", /* 2 */ @@ -147,7 +147,7 @@ static const char *task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char **p = &task_state_array[0]; + const char * const *p = &task_state_array[0]; BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); diff --git a/fs/proc/base.c b/fs/proc/base.c index dfa532730e5..4ede550517a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -static int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ -894,20 +894,20 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (!task) goto out_no_task; + copied = -ENOMEM; + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out_task; + mm = check_mem_permission(task); copied = PTR_ERR(mm); if (IS_ERR(mm)) - goto out_task; + goto out_free; copied = -EIO; if (file->private_data != (void *)((long)current->self_exec_id)) goto out_mm; - copied = -ENOMEM; - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - goto out_mm; - copied = 0; while (count > 0) { int this_len, retval; @@ -929,9 +929,11 @@ static ssize_t mem_write(struct file * file, const char __user *buf, count -= retval; } *ppos = dst; - free_page((unsigned long) page); + out_mm: mmput(mm); +out_free: + free_page((unsigned long) page); out_task: put_task_struct(task); out_no_task: @@ -1059,7 +1061,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, { struct task_struct *task; char buffer[PROC_NUMBUF]; - long oom_adjust; + int oom_adjust; unsigned long flags; int err; @@ -1071,7 +1073,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto out; } - err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + err = kstrtoint(strstrip(buffer), 0, &oom_adjust); if (err) goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && @@ -1168,7 +1170,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, struct task_struct *task; char buffer[PROC_NUMBUF]; unsigned long flags; - long oom_score_adj; + int oom_score_adj; int err; memset(buffer, 0, sizeof(buffer)); @@ -1179,7 +1181,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || @@ -1468,7 +1470,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, struct inode *inode = file->f_path.dentry->d_inode; struct task_struct *p; char buffer[PROC_NUMBUF]; - long nice; + int nice; int err; memset(buffer, 0, sizeof(buffer)); @@ -1477,9 +1479,9 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (copy_from_user(buffer, buf, count)) return -EFAULT; - err = strict_strtol(strstrip(buffer), 0, &nice); - if (err) - return -EINVAL; + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; p = get_proc_task(inode); if (!p) @@ -1576,57 +1578,6 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ - if (new_exe_file) - get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ - struct file *exe_file; - - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); - return exe_file; -} - -void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - static int proc_exe_link(struct inode *inode, struct path *exe_path) { struct task_struct *task; @@ -1736,8 +1687,7 @@ static int task_dumpable(struct task_struct *task) return 0; } - -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1779,7 +1729,7 @@ out_unlock: return NULL; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; @@ -1820,7 +1770,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat * made this apply to all per process world readable and executable * directories. */ -static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode; struct task_struct *task; @@ -1862,7 +1812,7 @@ static int pid_delete_dentry(const struct dentry * dentry) return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } -static const struct dentry_operations pid_dentry_operations = +const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1870,9 +1820,6 @@ static const struct dentry_operations pid_dentry_operations = /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); - /* * Fill a directory entry. * @@ -1885,8 +1832,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, * reported by readdir in sync with the inode numbers reported * by stat. */ -static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - char *name, int len, +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = filp->f_path.dentry; @@ -2820,6 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -3168,6 +3116,7 @@ out_no_task: static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f1281339b6f..f1637f17c37 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, } return ent; } +EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d15aa1b1cc8..74b48cfa1bb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode) rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns_ops; + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); } static struct kmem_cache * proc_inode_cachep; @@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + ei->ns = NULL; + ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c03e8d3a3a5..7838e5cfec1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +}; + void proc_init_inodecache(void); static inline struct pid *proc_pid(struct inode *inode) @@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); */ int proc_readdir(struct file *, void *, filldir_t); struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000000..781dec5bd68 --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,198 @@ +#include <linux/proc_fs.h> +#include <linux/nsproxy.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> +#include <linux/mnt_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/pid_namespace.h> +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns_ops->get(task); + if (!ei->ns) + goto out_iput; + + dentry->d_op = &pid_dentry_operations; + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, + const struct proc_ns_operations *ops) +{ + return proc_fill_cache(filp, dirent, filldir, + ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct proc_ns_operations **entry, **last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(ns_entries)) { + ret = 1; + goto out; + } + entry = ns_entries + i; + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + if (proc_ns_fill_cache(filp, dirent, filldir, + task, *entry) < 0) + goto out; + filp->f_pos++; + entry++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .readdir = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + for (entry = ns_entries; entry <= last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + error = ERR_PTR(-ENOENT); + if (entry > last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cffa2b8a2f..9758b654a1b 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -138,9 +138,9 @@ static int stat_open(struct inode *inode, struct file *file) struct seq_file *m; int res; - /* don't ask for more than the kmalloc() max size, currently 128 KB */ - if (size > 128 * 1024) - size = 128 * 1024; + /* don't ask for more than the kmalloc() max size */ + if (size > KMALLOC_MAX_SIZE) + size = KMALLOC_MAX_SIZE; buf = kmalloc(size, GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 318d8654989..25b6a887adb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - int flags = vma->vm_flags; + vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; @@ -536,15 +536,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - long type; + int type; + int rv; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - if (strict_strtol(strstrip(buffer), 10, &type)) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 10, &type); + if (rv < 0) + return rv; if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) return -EINVAL; task = get_proc_task(file->f_path.dentry->d_inode); @@ -769,18 +771,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!task) goto out; - mm = mm_for_maps(task); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_task; - ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; - if (!count) goto out_task; @@ -788,7 +784,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_mm; + goto out_task; + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; @@ -831,7 +832,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_free; + goto out_mm; } copied += len; buf += len; @@ -841,10 +842,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_free: - kfree(pm.buffer); out_mm: mmput(mm); +out_free: + kfree(pm.buffer); out_task: put_task_struct(task); out: @@ -858,7 +859,192 @@ const struct file_operations proc_pagemap_operations = { #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +{ + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page) || PageUnevictable(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page; + int nid; + + if (!pte_present(*pte)) + continue; + + page = vm_normal_page(md->vma, addr, *pte); + if (!page) + continue; + + if (PageReserved(page)) + continue; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + continue; + + gather_stats(page, md, pte_dirty(*pte)); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte)); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; + return 0; +} static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, @@ -869,7 +1055,20 @@ static const struct seq_operations proc_pid_numa_maps_op = { static int numa_maps_open(struct inode *inode, struct file *file) { - return do_maps_open(inode, file, &proc_pid_numa_maps_op); + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; } const struct file_operations proc_numa_maps_operations = { @@ -878,4 +1077,4 @@ const struct file_operations proc_numa_maps_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 74802bc5ded..cd99bf55765 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -35,6 +35,46 @@ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore = NULL; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} + /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf) @@ -55,9 +95,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, else nr_bytes = count; - tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) - return tmp; + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d3c032f5fa0..5b572c89e6c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -691,8 +691,11 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + if (nr) { spin_lock(&dq_list_lock); prune_dqcache(nr); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 118662690cd..76c8164d565 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + dentry_unhash(dentry); + /* we will be doing 2 balancings and update 2 stat data, we change quotas * of the owner of the directory and of the owner of the parent directory. * The quota structure is possibly deleted only on last iput => outside @@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; + if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) + dentry_unhash(new_dentry); + /* three balancings: (1) old name removal, (2) new name insertion and (3) maybe "save" link insertion stat data updates: (1) old directory, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 47d2a4498b0..50f1abccd1c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); if (!error) d_delete(dentry); - dput(dentry); return error; } diff --git a/fs/splice.c b/fs/splice.c index 50a5d978da1..aa866d30969 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill @@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) + wakeup_pipe_readers(pipe); while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); @@ -1892,12 +1896,9 @@ retry: /* * If we put data in the output pipe, wakeup any potential readers. */ - if (ret > 0) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - } + if (ret > 0) + wakeup_pipe_readers(opipe); + if (input_wakeup) wakeup_pipe_writers(ipipe); @@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * If we put data in the output pipe, wakeup any potential readers. */ - if (ret > 0) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - } + if (ret > 0) + wakeup_pipe_readers(opipe); return ret; } diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index efc309fa303..7797218d0b3 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -42,7 +42,7 @@ config SQUASHFS_LZO select LZO_DECOMPRESS help Saying Y here includes support for reading Squashfs file systems - compressed with LZO compresssion. LZO compression is mainly + compressed with LZO compression. LZO compression is mainly aimed at embedded systems with slower CPUs where the overheads of zlib are too high. @@ -57,7 +57,7 @@ config SQUASHFS_XZ select XZ_DEC help Saying Y here includes support for reading Squashfs file systems - compressed with XZ compresssion. XZ gives better compression than + compressed with XZ compression. XZ gives better compression than the default zlib compression, at the expense of greater CPU and memory overhead. diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8ab48bc2fa7..ed0eb2a921f 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index c37b520132f..f744be98cd5 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -29,7 +29,7 @@ * plus functions layered ontop of the generic cache implementation to * access the metadata and fragment caches. * - * To avoid out of memory and fragmentation isssues with vmalloc the cache + * To avoid out of memory and fragmentation issues with vmalloc the cache * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. * * It should be noted that the cache is not used for file datablocks, these @@ -393,19 +393,36 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, /* * Read a filesystem table (uncompressed sequence of bytes) from disk */ -int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, - int length) +void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; int i, res; - void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL); - if (data == NULL) - return -ENOMEM; + void *table, *buffer, **data; + + table = buffer = kmalloc(length, GFP_KERNEL); + if (table == NULL) + return ERR_PTR(-ENOMEM); + + data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) { + res = -ENOMEM; + goto failed; + } for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; + res = squashfs_read_data(sb, data, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + kfree(data); - return res; + + if (res < 0) + goto failed; + + return table; + +failed: + kfree(table); + return ERR_PTR(res); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index e921bd21373..9f1b0bb96f1 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 099745ad569..8ba70cff09a 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -4,7 +4,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 3f79cd1d0c1..9dfe2ce0fb7 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 7f93d5a9ee0..730c56248c9 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -121,30 +121,38 @@ static struct dentry *squashfs_get_parent(struct dentry *child) * Read uncompressed inode lookup table indexes off disk into memory */ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, - u64 lookup_table_start, unsigned int inodes) + u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); - __le64 *inode_lookup_table; - int err; + __le64 *table; TRACE("In read_inode_lookup_table, length %d\n", length); - /* Allocate inode lookup table indexes */ - inode_lookup_table = kmalloc(length, GFP_KERNEL); - if (inode_lookup_table == NULL) { - ERROR("Failed to allocate inode lookup table\n"); - return ERR_PTR(-ENOMEM); - } + /* Sanity check values */ + + /* there should always be at least one inode */ + if (inodes == 0) + return ERR_PTR(-EINVAL); + + /* length bytes should not extend into the next table - this check + * also traps instances where lookup_table_start is incorrectly larger + * than the next table start + */ + if (lookup_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, lookup_table_start, length); - err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start, - length); - if (err < 0) { - ERROR("unable to read inode lookup table\n"); - kfree(inode_lookup_table); - return ERR_PTR(err); + /* + * table[0] points to the first inode lookup table metadata block, + * this should be less than lookup_table_start + */ + if (!IS_ERR(table) && table[0] >= lookup_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); } - return inode_lookup_table; + return table; } diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a25c5060bdc..38bb1c64055 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 7eef571443c..1516a6490bf 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -71,26 +71,29 @@ int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, * Read the uncompressed fragment lookup table indexes off disk into memory */ __le64 *squashfs_read_fragment_index_table(struct super_block *sb, - u64 fragment_table_start, unsigned int fragments) + u64 fragment_table_start, u64 next_table, unsigned int fragments) { unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); - __le64 *fragment_index; - int err; + __le64 *table; - /* Allocate fragment lookup table indexes */ - fragment_index = kmalloc(length, GFP_KERNEL); - if (fragment_index == NULL) { - ERROR("Failed to allocate fragment index table\n"); - return ERR_PTR(-ENOMEM); - } + /* + * Sanity check, length bytes should not extend into the next table - + * this check also traps instances where fragment_table_start is + * incorrectly larger than the next table start + */ + if (fragment_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, fragment_table_start, length); - err = squashfs_read_table(sb, fragment_index, fragment_table_start, - length); - if (err < 0) { - ERROR("unable to read fragment index table\n"); - kfree(fragment_index); - return ERR_PTR(err); + /* + * table[0] points to the first fragment table metadata block, this + * should be less than fragment_table_start + */ + if (!IS_ERR(table) && table[0] >= fragment_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); } - return fragment_index; + return table; } diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index d8f32452638..a70858e0fb4 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -66,27 +66,37 @@ int squashfs_get_id(struct super_block *sb, unsigned int index, * Read uncompressed id lookup table indexes from disk into memory */ __le64 *squashfs_read_id_index_table(struct super_block *sb, - u64 id_table_start, unsigned short no_ids) + u64 id_table_start, u64 next_table, unsigned short no_ids) { unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); - __le64 *id_table; - int err; + __le64 *table; TRACE("In read_id_index_table, length %d\n", length); - /* Allocate id lookup table indexes */ - id_table = kmalloc(length, GFP_KERNEL); - if (id_table == NULL) { - ERROR("Failed to allocate id index table\n"); - return ERR_PTR(-ENOMEM); - } + /* Sanity check values */ + + /* there should always be at least one id */ + if (no_ids == 0) + return ERR_PTR(-EINVAL); + + /* + * length bytes should not extend into the next table - this check + * also traps instances where id_table_start is incorrectly larger + * than the next table start + */ + if (id_table_start + length > next_table) + return ERR_PTR(-EINVAL); + + table = squashfs_read_table(sb, id_table_start, length); - err = squashfs_read_table(sb, id_table, id_table_start, length); - if (err < 0) { - ERROR("unable to read id index table\n"); - kfree(id_table); - return ERR_PTR(err); + /* + * table[0] points to the first id lookup table metadata block, this + * should be less than id_table_start + */ + if (!IS_ERR(table) && table[0] >= id_table_start) { + kfree(table); + return ERR_PTR(-EINVAL); } - return id_table; + return table; } diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 62e63ad2507..04bebcaa237 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 5d922a6701a..4bc63ac64bc 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 1f2e608b878..e3be6a71cfa 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,24 +44,24 @@ extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, u64, int); extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, u64, int); -extern int squashfs_read_table(struct super_block *, void *, u64, int); +extern void *squashfs_read_table(struct super_block *, u64, int); /* decompressor.c */ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_init(struct super_block *, unsigned short); /* export.c */ -extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, unsigned int); /* fragment.c */ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); extern __le64 *squashfs_read_fragment_index_table(struct super_block *, - u64, unsigned int); + u64, u64, unsigned int); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); -extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, unsigned short); /* inode.c */ diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 4582c568ef4..b4a4e539a08 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -4,7 +4,7 @@ * Squashfs * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 359baefc01f..73588e7700e 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -4,7 +4,7 @@ * Squashfs * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index d9037a5215f..651f0b31d29 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -4,7 +4,7 @@ * Squashfs * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5c8184c061a..6f26abee359 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -83,7 +83,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) long long root_inode; unsigned short flags; unsigned int fragments; - u64 lookup_table_start, xattr_id_table_start; + u64 lookup_table_start, xattr_id_table_start, next_table; int err; TRACE("Entered squashfs_fill_superblock\n"); @@ -95,12 +95,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); - if (sblk == NULL) { - ERROR("Failed to allocate squashfs_super_block\n"); - goto failure; - } - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); @@ -114,10 +108,12 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) * of bytes_used) we need to set it to an initial sensible dummy value */ msblk->bytes_used = sizeof(*sblk); - err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk)); + sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); - if (err < 0) { + if (IS_ERR(sblk)) { ERROR("unable to read squashfs_super_block\n"); + err = PTR_ERR(sblk); + sblk = NULL; goto failed_mount; } @@ -218,18 +214,61 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + /* Handle xattrs */ + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { + next_table = msblk->bytes_used; + goto allocate_id_index_table; + } + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + ERROR("unable to read xattr id index table\n"); + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } + next_table = msblk->xattr_table; + +allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, - le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); + le64_to_cpu(sblk->id_table_start), next_table, + le16_to_cpu(sblk->no_ids)); if (IS_ERR(msblk->id_table)) { + ERROR("unable to read id index table\n"); err = PTR_ERR(msblk->id_table); msblk->id_table = NULL; goto failed_mount; } + next_table = msblk->id_table[0]; + + /* Handle inode lookup table */ + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto handle_fragments; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, next_table, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + ERROR("unable to read inode lookup table\n"); + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + next_table = msblk->inode_lookup_table[0]; + sb->s_export_op = &squashfs_export_ops; + +handle_fragments: fragments = le32_to_cpu(sblk->fragments); if (fragments == 0) - goto allocate_lookup_table; + goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); @@ -240,45 +279,29 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) /* Allocate and read fragment index table */ msblk->fragment_index = squashfs_read_fragment_index_table(sb, - le64_to_cpu(sblk->fragment_table_start), fragments); + le64_to_cpu(sblk->fragment_table_start), next_table, fragments); if (IS_ERR(msblk->fragment_index)) { + ERROR("unable to read fragment index table\n"); err = PTR_ERR(msblk->fragment_index); msblk->fragment_index = NULL; goto failed_mount; } + next_table = msblk->fragment_index[0]; -allocate_lookup_table: - lookup_table_start = le64_to_cpu(sblk->lookup_table_start); - if (lookup_table_start == SQUASHFS_INVALID_BLK) - goto allocate_xattr_table; - - /* Allocate and read inode lookup table */ - msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, - lookup_table_start, msblk->inodes); - if (IS_ERR(msblk->inode_lookup_table)) { - err = PTR_ERR(msblk->inode_lookup_table); - msblk->inode_lookup_table = NULL; +check_directory_table: + /* Sanity check directory_table */ + if (msblk->directory_table >= next_table) { + err = -EINVAL; goto failed_mount; } - sb->s_export_op = &squashfs_export_ops; - -allocate_xattr_table: - sb->s_xattr = squashfs_xattr_handlers; - xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); - if (xattr_id_table_start == SQUASHFS_INVALID_BLK) - goto allocate_root; - - /* Allocate and read xattr id lookup table */ - msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, - xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); - if (IS_ERR(msblk->xattr_id_table)) { - err = PTR_ERR(msblk->xattr_id_table); - msblk->xattr_id_table = NULL; - if (err != -ENOTSUPP) - goto failed_mount; + /* Sanity check inode_table */ + if (msblk->inode_table >= msblk->directory_table) { + err = -EINVAL; + goto failed_mount; } -allocate_root: + + /* allocate root */ root = new_inode(sb); if (!root) { err = -ENOMEM; @@ -318,11 +341,6 @@ failed_mount: sb->s_fs_info = NULL; kfree(sblk); return err; - -failure: - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - return -ENOMEM; } @@ -475,5 +493,5 @@ static const struct super_operations squashfs_super_ops = { module_init(init_squashfs_fs); module_exit(exit_squashfs_fs); MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); -MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>"); +MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>"); MODULE_LICENSE("GPL"); diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index ec86434921e..1191817264c 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 3876c36699a..92fcde7b4d6 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index b634efce4bd..c83f5d9ec12 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -31,6 +31,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) { ERROR("Xattrs in filesystem, these will be ignored\n"); + *xattr_table_start = start; return ERR_PTR(-ENOTSUPP); } diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 05385dbe146..c89607d690c 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -67,34 +67,29 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) { unsigned int len; - __le64 *xid_table; - struct squashfs_xattr_id_table id_table; - int err; + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + *xattr_ids = le32_to_cpu(id_table->xattr_ids); + kfree(id_table); + + /* Sanity check values */ + + /* there is always at least one xattr id */ + if (*xattr_ids == 0) + return ERR_PTR(-EINVAL); + + /* xattr_table should be less than start */ + if (*xattr_table_start >= start) + return ERR_PTR(-EINVAL); - err = squashfs_read_table(sb, &id_table, start, sizeof(id_table)); - if (err < 0) { - ERROR("unable to read xattr id table\n"); - return ERR_PTR(err); - } - *xattr_table_start = le64_to_cpu(id_table.xattr_table_start); - *xattr_ids = le32_to_cpu(id_table.xattr_ids); len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); TRACE("In read_xattr_index_table, length %d\n", len); - /* Allocate xattr id lookup table indexes */ - xid_table = kmalloc(len, GFP_KERNEL); - if (xid_table == NULL) { - ERROR("Failed to allocate xattr id index table\n"); - return ERR_PTR(-ENOMEM); - } - - err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len); - if (err < 0) { - ERROR("unable to read xattr id index table\n"); - kfree(xid_table); - return ERR_PTR(err); - } - - return xid_table; + return squashfs_read_table(sb, start + sizeof(*id_table), len); } diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index aa47a286d1f..1760b7d108f 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 517688b32ff..55d918fd2d8 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -2,7 +2,7 @@ * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 - * Phillip Lougher <phillip@lougher.demon.co.uk> + * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/fs/super.c b/fs/super.c index c04f7e0b7ed..c75593953c5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,6 +31,7 @@ #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> +#include <linux/cleancache.h> #include "internal.h" @@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; } out: return s; @@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { + cleancache_flush_fs(s); fs->kill_sb(s); /* * We need to call rcu_barrier so all the delayed rcu free diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index e474fbcf8bd..e2cc6756f3b 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) struct inode *inode = dentry->d_inode; int err = -ENOTEMPTY; + dentry_unhash(dentry); + if (sysv_empty_dir(inode)) { err = sysv_unlink(dir, dentry); if (!err) { @@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct sysv_dir_entry * old_de; int err = -ENOENT; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; diff --git a/fs/timerfd.c b/fs/timerfd.c index 8c4fc1425b3..f67acbdda5e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -22,16 +22,24 @@ #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> +#include <linux/rcupdate.h> struct timerfd_ctx { struct hrtimer tmr; ktime_t tintv; + ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int expired; int clockid; + struct rcu_head rcu; + struct list_head clist; + bool might_cancel; }; +static LIST_HEAD(cancel_list); +static DEFINE_SPINLOCK(cancel_lock); + /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, @@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; } +/* + * Called when the clock was set to cancel the timers in the cancel + * list. + */ +void timerfd_clock_was_set(void) +{ + ktime_t moffs = ktime_get_monotonic_offset(); + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs.tv64 != moffs.tv64) { + ctx->moffs.tv64 = KTIME_MAX; + wake_up_locked(&ctx->wqh); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); +} + +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } +} + +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + return false; + ctx->moffs = ktime_get_monotonic_offset(); + return true; +} + +static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) +{ + if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && + (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } else if (ctx->might_cancel) { + timerfd_remove_cancel(ctx); + } +} + static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; @@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } -static void timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) { enum hrtimer_mode htmode; ktime_t texp; + int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; @@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, ctx->clockid, htmode); + hrtimer_init(&ctx->tmr, clockid, htmode); hrtimer_set_expires(&ctx->tmr, texp); ctx->tmr.function = timerfd_tmrproc; - if (texp.tv64 != 0) + if (texp.tv64 != 0) { hrtimer_start(&ctx->tmr, texp, htmode); + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + return 0; } static int timerfd_release(struct inode *inode, struct file *file) { struct timerfd_ctx *ctx = file->private_data; + timerfd_remove_cancel(ctx); hrtimer_cancel(&ctx->tmr); - kfree(ctx); + kfree_rcu(ctx, rcu); return 0; } @@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ctx->ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + if (ctx->ticks) { ticks = ctx->ticks; + if (ctx->expired && ctx->tintv.tv64) { /* * If tintv.tv64 != 0, this is a periodic timer that @@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); @@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, struct file *file; struct timerfd_ctx *ctx; struct itimerspec ktmr, kotmr; + int ret; if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; @@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return PTR_ERR(file); ctx = file->private_data; + timerfd_setup_cancel(ctx, flags); + /* * We need to stop the existing timer before reprogramming * it to the new values. @@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, /* * Re-program the timer to the new value ... */ - timerfd_setup(ctx, flags, &ktmr); + ret = timerfd_setup(ctx, flags, &ktmr); spin_unlock_irq(&ctx->wqh.lock); fput(file); if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) return -EFAULT; - return 0; + return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 8b3a7da531e..315de66e52b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c) long long liab; spin_lock(&c->space_lock); - liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; spin_unlock(&c->space_lock); return liab; } @@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) int idx_lebs; long long idx_size; - idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; /* And make sure we have thrice the index size of space reserved */ idx_size += idx_size << 1; /* @@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c) * budgeted index space to the size of the current index, multiplies this by 3, * and makes sure this does not exceed the amount of free LEBs. * - * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, - * the index may be consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c) c->lst.taken_empty_lebs; if (unlikely(rsvd_idx_lebs > lebs)) { dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " - "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, + "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs); return -ENOSPC; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; + outstanding = c->bi.data_growth + c->bi.dd_growth; if (unlikely(available < outstanding)) { dbg_budg("out of data space: available %lld, outstanding %lld", @@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c) if (available - outstanding <= c->rp_size && !can_use_rp(c)) return -ENOSPC; - c->min_idx_lebs = min_idx_lebs; + c->bi.min_idx_lebs = min_idx_lebs; return 0; } @@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c, { int data_growth; - data_growth = req->new_ino ? c->inode_budget : 0; + data_growth = req->new_ino ? c->bi.inode_budget : 0; if (req->new_page) - data_growth += c->page_budget; + data_growth += c->bi.page_budget; if (req->new_dent) - data_growth += c->dent_budget; + data_growth += c->bi.dent_budget; data_growth += req->new_ino_d; return data_growth; } @@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c, { int dd_growth; - dd_growth = req->dirtied_page ? c->page_budget : 0; + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; if (req->dirtied_ino) - dd_growth += c->inode_budget << (req->dirtied_ino - 1); + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); if (req->mod_dent) - dd_growth += c->dent_budget; + dd_growth += c->bi.dent_budget; dd_growth += req->dirtied_ino_d; return dd_growth; } @@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) again: spin_lock(&c->space_lock); - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->budg_dd_growth >= 0); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); - if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { dbg_budg("no space"); spin_unlock(&c->space_lock); return -ENOSPC; } - c->budg_idx_growth += idx_growth; - c->budg_data_growth += data_growth; - c->budg_dd_growth += dd_growth; + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; err = do_budget_space(c); if (likely(!err)) { @@ -484,9 +484,9 @@ again: } /* Restore the old values */ - c->budg_idx_growth -= idx_growth; - c->budg_data_growth -= data_growth; - c->budg_dd_growth -= dd_growth; + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; spin_unlock(&c->space_lock); if (req->fast) { @@ -506,9 +506,9 @@ again: goto again; } dbg_budg("FS is full, -ENOSPC"); - c->nospace = 1; + c->bi.nospace = 1; if (can_use_rp(c) || c->rp_size == 0) - c->nospace_rp = 1; + c->bi.nospace_rp = 1; smp_wmb(); } else ubifs_err("cannot budget space, error %d", err); @@ -523,8 +523,8 @@ again: * This function releases the space budgeted by 'ubifs_budget_space()'. Note, * since the index changes (which were budgeted for in @req->idx_growth) will * only be written to the media on commit, this function moves the index budget - * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be - * zeroed by the commit operation. + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. */ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) { @@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) if (!req->data_growth && !req->dd_growth) return; - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); spin_lock(&c->space_lock); - c->budg_idx_growth -= req->idx_growth; - c->budg_uncommitted_idx += req->idx_growth; - c->budg_data_growth -= req->data_growth; - c->budg_dd_growth -= req->dd_growth; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); - - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->budg_dd_growth >= 0); - ubifs_assert(c->min_idx_lebs < c->main_lebs); - ubifs_assert(!(c->budg_idx_growth & 7)); - ubifs_assert(!(c->budg_data_growth & 7)); - ubifs_assert(!(c->budg_dd_growth & 7)); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); spin_unlock(&c->space_lock); } @@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c) { spin_lock(&c->space_lock); /* Release the index growth reservation */ - c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; /* Release the data growth reservation */ - c->budg_data_growth -= c->page_budget; + c->bi.data_growth -= c->bi.page_budget; /* Increase the dirty data growth reservation instead */ - c->budg_dd_growth += c->page_budget; + c->bi.dd_growth += c->bi.page_budget; /* And re-calculate the indexing space reservation */ - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); } @@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, memset(&req, 0, sizeof(struct ubifs_budget_req)); /* The "no space" flags will be cleared because dd_growth is > 0 */ - req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } @@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) int rsvd_idx_lebs, lebs; long long available, outstanding, free; - ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); - outstanding = c->budg_data_growth + c->budg_dd_growth; - available = ubifs_calc_available(c, c->min_idx_lebs); + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); /* * When reporting free space to user-space, UBIFS guarantees that it is @@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) * Note, the calculations below are similar to what we have in * 'do_budget_space()', so refer there for comments. */ - if (c->min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 1bd01ded712..87cd0ead863 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c) c->mst_node->root_len = cpu_to_le32(zroot.len); c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); - c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 004d3745dc4..0bb2bcef0de 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -34,7 +34,6 @@ #include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/math64.h> -#include <linux/slab.h> #ifdef CONFIG_UBIFS_FS_DEBUG @@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_msg_flags; unsigned int ubifs_chk_flags; unsigned int ubifs_tst_flags; -module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); MODULE_PARM_DESC(debug_chks, "Debug check flags"); MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); @@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) printk(KERN_DEBUG "\tflags %#x\n", sup_flags); printk(KERN_DEBUG "\t big_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\t space_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); printk(KERN_DEBUG "\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); printk(KERN_DEBUG "\tleb_size %u\n", @@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst) spin_unlock(&dbg_lock); } -void dbg_dump_budg(struct ubifs_info *c) +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) { int i; struct rb_node *rb; @@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c) struct ubifs_gced_idx_leb *idx_gc; long long available, outstanding, free; - ubifs_assert(spin_is_locked(&c->space_lock)); + spin_lock(&c->space_lock); spin_lock(&dbg_lock); - printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " - "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, - c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); - printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " - "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, - c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, - c->freeable_cnt); - printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " - "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, - c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " + "total budget sum %lld\n", current->pid, + bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " + "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, + bi->idx_growth); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " + "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, + bi->uncommitted_idx); + printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, bi->dent_budget); + printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", + bi->nospace, bi->nospace_rp); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), atomic_long_read(&c->dirty_zn_cnt), atomic_long_read(&c->clean_zn_cnt)); - printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", - c->dark_wm, c->dead_wm, c->max_idx_node_sz); printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); + /* If we are in R/O mode, journal heads do not exist */ if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) @@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c) printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); /* Print budgeting predictions */ - available = ubifs_calc_available(c, c->min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; free = ubifs_get_free_space_nolock(c); printk(KERN_DEBUG "Budgeting predictions:\n"); printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", available, outstanding, free); +out_unlock: spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); } void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) @@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) if (bud->lnum == lp->lnum) { int head = 0; for (i = 0; i < c->jhead_cnt; i++) { - if (lp->lnum == c->jheads[i].wbuf.lnum) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { printk(KERN_CONT ", jhead %s", dbg_jhead(i)); head = 1; @@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c) spin_lock(&c->space_lock); memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; /* * We use a dirty hack here and zero out @c->freeable_cnt, because it @@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c) out: ubifs_msg("saved lprops statistics dump"); dbg_dump_lstats(&d->saved_lst); - ubifs_get_lp_stats(c, &lst); - + ubifs_msg("saved budgeting info dump"); + dbg_dump_budg(c, &d->saved_bi); + ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); ubifs_msg("current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); - - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + ubifs_msg("current budgeting info dump"); + dbg_dump_budg(c, &c->bi); dump_stack(); return -EINVAL; } @@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, struct rb_node **p, *parent = NULL; struct fsck_inode *fscki; ino_t inum = key_inum_flash(c, &ino->key); + struct inode *inode; + struct ubifs_inode *ui; p = &fsckd->inodes.rb_node; while (*p) { @@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, if (!fscki) return ERR_PTR(-ENOMEM); + inode = ilookup(c->vfs_sb, inum); + fscki->inum = inum; - fscki->nlink = le32_to_cpu(ino->nlink); - fscki->size = le64_to_cpu(ino->size); - fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); - fscki->xattr_sz = le32_to_cpu(ino->xattr_size); - fscki->xattr_nms = le32_to_cpu(ino->xattr_names); - fscki->mode = le32_to_cpu(ino->mode); + /* + * If the inode is present in the VFS inode cache, use it instead of + * the on-flash inode which might be out-of-date. E.g., the size might + * be out-of-date. If we do not do this, the following may happen, for + * example: + * 1. A power cut happens + * 2. We mount the file-system R/O, the replay process fixes up the + * inode size in the VFS cache, but on on-flash. + * 3. 'check_leaf()' fails because it hits a data node beyond inode + * size. + */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + if (S_ISDIR(fscki->mode)) { fscki->calc_sz = UBIFS_INO_NODE_SZ; fscki->calc_cnt = 2; } + rb_link_node(&fscki->rb, parent, p); rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; } @@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) hashb = key_block(c, &sb->key); if (hasha > hashb) { - ubifs_err("larger hash %u goes before %u", hasha, hashb); + ubifs_err("larger hash %u goes before %u", + hasha, hashb); goto error_dump; } } @@ -2437,14 +2491,12 @@ error_dump: return 0; } -static int invocation_cnt; - int dbg_force_in_the_gaps(void) { - if (!dbg_force_in_the_gaps_enabled) + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) return 0; - /* Force in-the-gaps every 8th commit */ - return !((invocation_cnt++) & 0x7); + + return !(random32() & 7); } /* Failure mode for recovery testing */ @@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int len, int check) { if (in_failure_mode(desc)) - return -EIO; + return -EROFS; return ubi_leb_read(desc, lnum, buf, offset, len, check); } @@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, int err, failing; if (in_failure_mode(desc)) - return -EIO; + return -EROFS; failing = do_fail(desc, lnum, 1); if (failing) cut_data(buf, len); @@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, if (err) return err; if (failing) - return -EIO; + return -EROFS; return 0; } @@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, int err; if (do_fail(desc, lnum, 1)) - return -EIO; + return -EROFS; err = ubi_leb_change(desc, lnum, buf, len, dtype); if (err) return err; if (do_fail(desc, lnum, 1)) - return -EIO; + return -EROFS; return 0; } @@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_erase(desc, lnum); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } @@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_unmap(desc, lnum); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) { if (in_failure_mode(desc)) - return -EIO; + return -EROFS; return ubi_is_mapped(desc, lnum); } @@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_map(desc, lnum, dtype); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } @@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void) static int open_debugfs_file(struct inode *inode, struct file *file) { file->private_data = inode->i_private; - return 0; + return nonseekable_open(inode, file); } static ssize_t write_debugfs_file(struct file *file, const char __user *buf, @@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf, if (file->f_path.dentry == d->dfs_dump_lprops) dbg_dump_lprops(c); - else if (file->f_path.dentry == d->dfs_dump_budg) { - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - } else if (file->f_path.dentry == d->dfs_dump_tnc) { + else if (file->f_path.dentry == d->dfs_dump_budg) + dbg_dump_budg(c, &c->bi); + else if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); dbg_dump_tnc(c); mutex_unlock(&c->tnc_mutex); } else return -EINVAL; - *ppos += count; return count; } @@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = { .open = open_debugfs_file, .write = write_debugfs_file, .owner = THIS_MODULE, - .llseek = default_llseek, + .llseek = no_llseek, }; /** diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index e6493cac193..a811ac4a26b 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, #ifdef CONFIG_UBIFS_FS_DEBUG +#include <linux/random.h> + /** * ubifs_debug_info - per-FS debugging information. * @old_zroot: old index root - used by 'dbg_check_old_index()' @@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, * @new_ihead_offs: used by debugging to check @c->ihead_offs * * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') - * @saved_free: saved free space (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt * - * dfs_dir_name: name of debugfs directory containing this file-system's files - * dfs_dir: direntry object of the file-system debugfs directory - * dfs_dump_lprops: "dump lprops" debugfs knob - * dfs_dump_budg: "dump budgeting information" debugfs knob - * dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { struct ubifs_zbranch old_zroot; @@ -76,7 +80,9 @@ struct ubifs_debug_info { int new_ihead_offs; struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; long long saved_free; + int saved_idx_gc_cnt; char dfs_dir_name[100]; struct dentry *dfs_dir; @@ -101,23 +107,7 @@ struct ubifs_debug_info { } \ } while (0) -#define dbg_dump_stack() do { \ - if (!dbg_failure_mode) \ - dump_stack(); \ -} while (0) - -/* Generic debugging messages */ -#define dbg_msg(fmt, ...) do { \ - spin_lock(&dbg_lock); \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ - __func__, ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ -} while (0) - -#define dbg_do_msg(typ, fmt, ...) do { \ - if (ubifs_msg_flags & typ) \ - dbg_msg(fmt, ##__VA_ARGS__); \ -} while (0) +#define dbg_dump_stack() dump_stack() #define dbg_err(fmt, ...) do { \ spin_lock(&dbg_lock); \ @@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) -/* General messages */ -#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) +#define ubifs_dbg_msg(type, fmt, ...) do { \ + spin_lock(&dbg_lock); \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) +/* Just a debugging messages not related to any specific UBIFS subsystem */ +#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +/* General messages */ +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ -#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) - +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) /* Additional TNC messages */ -#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) - +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) /* Additional lprops messages */ -#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) - +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) /* Additional LEB find messages */ -#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) - +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) /* Additional mount messages */ -#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) - +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) /* Additional I/O messages */ -#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) - +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) /* Additional commit messages */ -#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) - +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) /* Additional budgeting messages */ -#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) - +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) /* Additional log messages */ -#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) - +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) /* Additional gc messages */ -#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) - +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) /* Additional scan messages */ -#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) - +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) - -/* - * Debugging message type flags. - * - * UBIFS_MSG_GEN: general messages - * UBIFS_MSG_JNL: journal messages - * UBIFS_MSG_MNT: mount messages - * UBIFS_MSG_CMT: commit messages - * UBIFS_MSG_FIND: LEB find messages - * UBIFS_MSG_BUDG: budgeting messages - * UBIFS_MSG_GC: garbage collection messages - * UBIFS_MSG_TNC: TNC messages - * UBIFS_MSG_LP: lprops messages - * UBIFS_MSG_IO: I/O messages - * UBIFS_MSG_LOG: log messages - * UBIFS_MSG_SCAN: scan messages - * UBIFS_MSG_RCVRY: recovery messages - */ -enum { - UBIFS_MSG_GEN = 0x1, - UBIFS_MSG_JNL = 0x2, - UBIFS_MSG_MNT = 0x4, - UBIFS_MSG_CMT = 0x8, - UBIFS_MSG_FIND = 0x10, - UBIFS_MSG_BUDG = 0x20, - UBIFS_MSG_GC = 0x40, - UBIFS_MSG_TNC = 0x80, - UBIFS_MSG_LP = 0x100, - UBIFS_MSG_IO = 0x200, - UBIFS_MSG_LOG = 0x400, - UBIFS_MSG_SCAN = 0x800, - UBIFS_MSG_RCVRY = 0x1000, -}; +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) /* * Debugging check flags. @@ -233,11 +186,9 @@ enum { /* * Special testing flags. * - * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method * UBIFS_TST_RCVRY: failure mode for recovery testing */ enum { - UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, UBIFS_TST_RCVRY = 0x4, }; @@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, int offs); void dbg_dump_budget_req(const struct ubifs_budget_req *req); void dbg_dump_lstats(const struct ubifs_lp_stats *lst); -void dbg_dump_budg(struct ubifs_info *c); +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); @@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); /* Force the use of in-the-gaps method for testing */ - -#define dbg_force_in_the_gaps_enabled \ - (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) - +static inline int dbg_force_in_the_gaps_enabled(void) +{ + return ubifs_chk_flags & UBIFS_CHK_GEN; +} int dbg_force_in_the_gaps(void); /* Failure mode for recovery testing */ - #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) #ifndef UBIFS_DBG_PRESERVE_UBI - #define ubi_leb_read dbg_leb_read #define ubi_leb_write dbg_leb_write #define ubi_leb_change dbg_leb_change @@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void); #define ubi_leb_unmap dbg_leb_unmap #define ubi_is_mapped dbg_is_mapped #define ubi_leb_map dbg_leb_map - #endif int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, @@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); __func__, __LINE__, current->pid); \ } while (0) -#define dbg_err(fmt, ...) do { \ - if (0) \ - ubifs_err(fmt, ##__VA_ARGS__); \ +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ } while (0) -#define dbg_msg(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__); \ +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + pr_debug(fmt "\n", ##__VA_ARGS__); \ } while (0) #define dbg_dump_stack() #define ubifs_assert_cmt_locked(c) -#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) @@ -420,7 +368,9 @@ static inline void dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } static inline void dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } -static inline void dbg_dump_budg(struct ubifs_info *c) { return; } +static inline void +dbg_dump_budg(struct ubifs_info *c, + const struct ubifs_budg_info *bi) { return; } static inline void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { return; } static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } @@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) { return 0; } static inline int dbg_force_in_the_gaps(void) { return 0; } -#define dbg_force_in_the_gaps_enabled 0 -#define dbg_failure_mode 0 +#define dbg_force_in_the_gaps_enabled() 0 +#define dbg_failure_mode 0 static inline int dbg_debugfs_init(void) { return 0; } static inline void dbg_debugfs_exit(void) { return; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7217d67a80a..c2b80943560 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; @@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + dentry_unhash(dentry); + /* * Budget request settings: deletion direntry, deletion inode and * changing the parent inode. If budgeting fails, go ahead anyway @@ -693,7 +695,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; @@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b286db79c68..5e7fccfc4b2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c) */ static void release_existing_page_budget(struct ubifs_info *c) { - struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; ubifs_release_budget(c, &req); } @@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len) * the page locked, and it locks @ui_mutex. However, write-back does take inode * @i_mutex, which means other VFS operations may be run on this inode at the * same time. And the problematic one is truncation to smaller size, from where - * we have to call 'truncate_setsize()', which first changes @inode->i_size, then - * drops the truncated pages. And while dropping the pages, it takes the page - * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with - * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This - * means that @inode->i_size is changed while @ui_mutex is unlocked. + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. * * XXX(truncate): with the new truncate sequence this is not true anymore, * and the calls to truncate_setsize can be move around freely. They should @@ -1189,7 +1189,7 @@ out_budg: if (budgeted) ubifs_release_budget(c, &req); else { - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return err; @@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync) dbg_gen("syncing inode %lu", inode->i_ino); - if (inode->i_sb->s_flags & MS_RDONLY) + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ return 0; /* @@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) } /* - * mmap()d file has taken write protection fault and is being made - * writable. UBIFS must ensure page is budgeted for. + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; @@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; - /* 'generic_file_mmap()' takes care of NOMMU case */ err = generic_file_mmap(file, vma); if (err) return err; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1d54383d126..2559d174e00 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, * But if the index takes fewer LEBs than it is reserved for it, * this function must avoid picking those reserved LEBs. */ - if (c->min_idx_lebs >= c->lst.idx_lebs) { - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; exclude_index = 1; } spin_unlock(&c->space_lock); @@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, pick_free = 0; } else { spin_lock(&c->space_lock); - exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); spin_unlock(&c->space_lock); } @@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, /* Check if there are enough empty LEBs for commit */ spin_lock(&c->space_lock); - if (c->min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 151f1088282..ded29f6224c 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c) if (err) return err; + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; @@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c) * This function compares data nodes @a and @b. Returns %1 if @a has greater * inode or block number, and %-1 otherwise. */ -int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; @@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) * first and sorted by length in descending order. Directory entry nodes go * after inode nodes and are sorted in ascending hash valuer order. */ -int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; @@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) ubifs_assert(c->gc_lnum != lnum); ubifs_assert(wbuf->lnum != lnum); + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. + */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + /* * We scan the entire LEB even though we only really need to scan up to * (c->leb_size - lp->free). @@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) "(min. space %d)", lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, min_space); - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - dbg_gc("LEB %d is free, return it", lp.lnum); - /* - * ubifs_find_dirty_leb() doesn't return freeable index - * LEBs. - */ - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - if (lp.free != c->leb_size) { - /* - * Write buffers must be sync'd before - * unmapping freeable LEBs, because one of them - * may contain data which obsoletes something - * in 'lp.pnum'. - */ - ret = gc_sync_wbufs(c); - if (ret) - goto out; - ret = ubifs_change_one_lp(c, lp.lnum, - c->leb_size, 0, 0, 0, - 0); - if (ret) - goto out; - } - ret = ubifs_leb_unmap(c, lp.lnum); - if (ret) - goto out; - ret = lp.lnum; - break; - } - space_before = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum == -1) space_before = 0; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index dfd168b7807..166951e0dcd 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); if (c->leb_size - wbuf->offs >= c->max_write_size) - ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); if (c->ro_error) return -EROFS; @@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @dtype: data type * * This function targets the write-buffer to logical eraseblock @lnum:@offs. - * The write-buffer is synchronized if it is not empty. Returns zero in case of - * success and a negative error code in case of failure. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. */ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype) @@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, ubifs_assert(offs >= 0 && offs <= c->leb_size); ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); ubifs_assert(lnum != wbuf->lnum); - - if (wbuf->used > 0) { - int err = ubifs_wbuf_sync_nolock(wbuf); - - if (err) - return err; - } + ubifs_assert(wbuf->used == 0); spin_lock(&wbuf->lock); wbuf->lnum = lnum; @@ -573,7 +567,7 @@ out_timers: int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8), offs; + int err, written, n, aligned_len = ALIGN(len, 8); dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, dbg_ntype(((struct ubifs_ch *)buf)->node_type), @@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); if (c->leb_size - wbuf->offs >= c->max_write_size) - ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - offs = wbuf->offs; written = 0; if (wbuf->used) { @@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (err) goto out; - offs += wbuf->size; + wbuf->offs += wbuf->size; len -= wbuf->avail; aligned_len -= wbuf->avail; written += wbuf->avail; @@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (err) goto out; - offs += wbuf->size; + wbuf->offs += wbuf->size; len -= wbuf->size; aligned_len -= wbuf->size; written += wbuf->size; @@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; - dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, - wbuf->dtype); + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); if (err) goto out; - offs += n; + wbuf->offs += n; aligned_len -= n; len -= n; written += n; @@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ memcpy(wbuf->buf, buf + written, len); - wbuf->offs = offs; if (c->leb_size - wbuf->offs >= c->max_write_size) wbuf->size = c->max_write_size; else diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index aed25e86422..34b1679e6e3 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -141,14 +141,8 @@ again: * LEB with some empty space. */ lnum = ubifs_find_free_space(c, len, &offs, squeeze); - if (lnum >= 0) { - /* Found an LEB, add it to the journal head */ - err = ubifs_add_bud_to_log(c, jhead, lnum, offs); - if (err) - goto out_return; - /* A new bud was successfully allocated and added to the log */ + if (lnum >= 0) goto out; - } err = lnum; if (err != -ENOSPC) @@ -203,12 +197,23 @@ again: return 0; } - err = ubifs_add_bud_to_log(c, jhead, lnum, 0); - if (err) - goto out_return; offs = 0; out: + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. + */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); if (err) goto out_unlock; @@ -380,10 +385,8 @@ out: if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ down_write(&c->commit_sem); - spin_lock(&c->space_lock); dbg_dump_stack(); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 40fa780ebea..affea9494ae 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) } /** - * next_log_lnum - switch to the next log LEB. - * @c: UBIFS file-system description object - * @lnum: current log LEB - */ -static inline int next_log_lnum(const struct ubifs_info *c, int lnum) -{ - lnum += 1; - if (lnum > c->log_last) - lnum = UBIFS_LOG_LNUM; - - return lnum; -} - -/** * empty_log_bytes - calculate amount of empty space in the log. * @c: UBIFS file-system description object */ @@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) ref->jhead = cpu_to_le32(jhead); if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) /* Switch to the next log LEB */ if (c->lhead_offs) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) c->lhead_offs += len; if (c->lhead_offs == c->leb_size) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) } mutex_lock(&c->log_mutex); for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; - lnum = next_log_lnum(c, lnum)) { + lnum = ubifs_next_log_lnum(c, lnum)) { dbg_log("unmap log LEB %d", lnum); err = ubifs_leb_unmap(c, lnum); if (err) @@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); if (err) return err; - *lnum = next_log_lnum(c, *lnum); + *lnum = ubifs_next_log_lnum(c, *lnum); *offs = 0; } memcpy(buf + *offs, node, len); @@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) ubifs_scan_destroy(sleb); if (lnum == c->lhead_lnum) break; - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); } if (offs) { int sz = ALIGN(offs, c->min_io_size); @@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) /* Unmap remaining LEBs */ lnum = write_lnum; do { - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); err = ubifs_leb_unmap(c, lnum); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 0ee0847f242..667884f4a61 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1007,21 +1007,11 @@ out: } /** - * struct scan_check_data - data provided to scan callback function. - * @lst: LEB properties statistics - * @err: error code - */ -struct scan_check_data { - struct ubifs_lp_stats lst; - int err; -}; - -/** * scan_check_cb - scan callback. * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @lst: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -1030,11 +1020,10 @@ struct scan_check_data { */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct scan_check_data *data) + struct ubifs_lp_stats *lst) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_lp_stats *lst = &data->lst; int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; void *buf = NULL; @@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c, if (cat != (lp->flags & LPROPS_CAT_MASK)) { ubifs_err("bad LEB category %d expected %d", (lp->flags & LPROPS_CAT_MASK), cat); - goto out; + return -EINVAL; } } @@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c, } if (!found) { ubifs_err("bad LPT list (category %d)", cat); - goto out; + return -EINVAL; } } } @@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c, if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || lp != heap->arr[lp->hpos]) { ubifs_err("bad LPT heap (category %d)", cat); - goto out; + return -EINVAL; } } buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); - if (!buf) { - ubifs_err("cannot allocate memory to scan LEB %d", lnum); - goto out; + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; } sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { - /* - * After an unclean unmount, empty and freeable LEBs - * may contain garbage. - */ - if (lp->free == c->leb_size) { - ubifs_err("scan errors were in empty LEB " - "- continuing checking"); - lst->empty_lebs += 1; - lst->total_free += c->leb_size; - lst->total_dark += ubifs_calc_dark(c, c->leb_size); - ret = LPT_SCAN_CONTINUE; - goto exit; - } - - if (lp->free + lp->dirty == c->leb_size && - !(lp->flags & LPROPS_INDEX)) { - ubifs_err("scan errors were in freeable LEB " - "- continuing checking"); - lst->total_free += lp->free; - lst->total_dirty += lp->dirty; - lst->total_dark += ubifs_calc_dark(c, c->leb_size); - ret = LPT_SCAN_CONTINUE; - goto exit; + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); } - data->err = PTR_ERR(sleb); - ret = LPT_SCAN_STOP; - goto exit; + goto out; } is_idx = -1; @@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - ret = LPT_SCAN_CONTINUE; -exit: vfree(buf); - return ret; + return LPT_SCAN_CONTINUE; out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " @@ -1258,10 +1240,10 @@ out_print: dbg_dump_leb(c, lnum); out_destroy: ubifs_scan_destroy(sleb); + ret = -EINVAL; out: vfree(buf); - data->err = -EINVAL; - return LPT_SCAN_STOP; + return ret; } /** @@ -1278,8 +1260,7 @@ out: int dbg_check_lprops(struct ubifs_info *c) { int i, err; - struct scan_check_data data; - struct ubifs_lp_stats *lst = &data.lst; + struct ubifs_lp_stats lst; if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) return 0; @@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c) return err; } - memset(lst, 0, sizeof(struct ubifs_lp_stats)); - - data.err = 0; + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, (ubifs_lpt_scan_callback)scan_check_cb, - &data); + &lst); if (err && err != -ENOSPC) goto out; - if (data.err) { - err = data.err; - goto out; - } - if (lst->empty_lebs != c->lst.empty_lebs || - lst->idx_lebs != c->lst.idx_lebs || - lst->total_free != c->lst.total_free || - lst->total_dirty != c->lst.total_dirty || - lst->total_used != c->lst.total_used) { + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { ubifs_err("bad overall accounting"); ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " "total_free %lld, total_dirty %lld, total_used %lld", - lst->empty_lebs, lst->idx_lebs, lst->total_free, - lst->total_dirty, lst->total_used); + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " "total_free %lld, total_dirty %lld, total_used %lld", c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, @@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c) goto out; } - if (lst->total_dead != c->lst.total_dead || - lst->total_dark != c->lst.total_dark) { + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { ubifs_err("bad dead/dark space accounting"); ubifs_err("calculated: total_dead %lld, total_dark %lld", - lst->total_dead, lst->total_dark); + lst.total_dead, lst.total_dark); ubifs_err("read from lprops: total_dead %lld, total_dark %lld", c->lst.total_dead, c->lst.total_dark); err = -EINVAL; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 0c9c69bd983..dfcb5748a7d 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -29,6 +29,12 @@ #include <linux/slab.h> #include "ubifs.h" +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_populate_lsave(struct ubifs_info *c); +#else +#define dbg_populate_lsave(c) 0 +#endif + /** * first_dirty_cnode - find first dirty cnode. * @c: UBIFS file-system description object @@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, if (nnode->nbranch[iip].lnum) break; } - } while (iip >= UBIFS_LPT_FANOUT); + } while (iip >= UBIFS_LPT_FANOUT); /* Go right */ nnode = ubifs_get_nnode(c, nnode, iip); @@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c) c->lpt_drty_flgs |= LSAVE_DIRTY; ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); } + + if (dbg_populate_lsave(c)) + return; + list_for_each_entry(lprops, &c->empty_list, list) { c->lsave[cnt++] = lprops->lnum; if (cnt >= c->lsave_cnt) @@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c) current->pid); } +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. + */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (random32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 21f47afdacf..278c2382e8c 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c) } main_sz = (long long)c->main_lebs * c->leb_size; - if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { err = 9; goto out; } @@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c) } if (c->lst.total_dead + c->lst.total_dark + - c->lst.total_used + c->old_idx_sz > main_sz) { + c->lst.total_used + c->bi.old_idx_sz > main_sz) { err = 21; goto out; } @@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c) c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); - c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); @@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c) c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); - c->calc_idx_sz = c->old_idx_sz; + c->calc_idx_sz = c->bi.old_idx_sz; if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) c->no_orphs = 1; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index c3de04dc952..0b5296a9a4c 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c) mutex_unlock(&c->lp_mutex); } +/** + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. + */ +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 09df318e368..bd644bf587a 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c) sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) { if (PTR_ERR(sleb) == -EUCLEAN) - sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 3dbad6fbd1e..731d9e2e7b5 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, } /** - * drop_incomplete_group - drop nodes from an incomplete group. + * drop_last_node - drop the last node or group of nodes. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped * - * This function returns %1 if nodes are dropped and %0 otherwise. + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB or the last group of nodes if @grouped is not zero. + * This function returns %1 if a node was dropped and %0 otherwise. */ -static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) { int dropped = 0; @@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) kfree(snod); sleb->nodes_cnt -= 1; dropped = 1; + if (!grouped) + break; } return dropped; } @@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int grouped) { - int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; - int empty_chkd = 0, start = offs; + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; struct ubifs_scan_leb *sleb; void *buf = sbuf + offs; @@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, if (IS_ERR(sleb)) return sleb; - if (sleb->ecc) - need_clean = 1; - + ubifs_assert(len >= 8); while (len >= 8) { - int ret; - dbg_scan("look at LEB %d:%d (%d bytes left)", lnum, offs, len); @@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * Scan quietly until there is an error from which we cannot * recover */ - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); - + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); if (ret == SCANNED_A_NODE) { /* A valid node, and not a padding node */ struct ubifs_ch *ch = buf; @@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, offs += node_len; buf += node_len; len -= node_len; - continue; - } - - if (ret > 0) { + } else if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; buf += ret; len -= ret; - continue; - } - - if (ret == SCANNED_EMPTY_SPACE) { - if (!is_empty(buf, len)) { - if (!is_last_write(c, buf, offs)) - break; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } - empty_chkd = 1; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption - %d", ret); break; - } - - if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } - - if (ret == SCANNED_A_CORRUPT_NODE) - if (no_more_nodes(c, buf, len, lnum, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } - - if (quiet) { - /* Redo the last scan but noisily */ - quiet = 0; - continue; - } - - switch (ret) { - case SCANNED_GARBAGE: - dbg_err("garbage"); - goto corrupted; - case SCANNED_A_CORRUPT_NODE: - case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); - goto corrupted; - default: - dbg_err("unknown"); + } else { + dbg_err("unexpected return value %d", ret); err = -EINVAL; goto error; } } - if (!empty_chkd && !is_empty(buf, len)) { - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } else { + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { int corruption = first_non_ff(buf, len); /* @@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } } - /* Drop nodes from incomplete group */ - if (grouped && drop_incomplete_group(sleb, &offs)) { - buf = sbuf + offs; - len = c->leb_size - offs; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. + */ + drop_last_node(sleb, &offs, 1); - if (offs % c->min_io_size) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } + /* + * While we are in the middle of the same min. I/O unit keep dropping + * nodes. So basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely with all + * the uncorrupted node which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the corruption + * starts B, and the previous min. I/O unit A. The below code tries to + * deal with a situation when half of B contains valid nodes or the end + * of a valid node, and the second half of B contains corrupted data or + * garbage. This means that UBIFS had been writing to B just before the + * power cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully and the + * other half not, but this is possible in our 'failure mode emulation' + * infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Whey can't + * we just clean-up the second half of B by putting a padding node + * there? We can, and this works fine with one exception which was + * reproduced with power cut emulation testing and happens extremely + * rarely. The description follows, but it is worth noting that that is + * only about the GC head, so we could do this trick only if the bud + * belongs to the GC head, but it does not seem to be worth an + * additional "if" statement. + * + * So, imagine the file-system is full, we run GC which is moving valid + * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head + * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X + * and will try to continue. Imagine that LEB X is currently the + * dirtiest LEB, and the amount of used space in LEB Y is exactly the + * same as amount of free space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to LEB Y. We + * are here trying to recover LEB Y which is the GC head LEB. We find + * the min. I/O unit B as described above. Then we clean-up LEB Y by + * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function + * fails, because it cannot find a dirty LEB which could be GC'd into + * LEB Y! Even LEB X does not match because the amount of valid nodes + * there does not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and analysed is + * that we cannot mount the file-system with -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we should + * free min. I/O unit B in LEB Y completely and the last used min. I/O + * unit in LEB Y should be A. This is basically what the below code + * tries to do. + */ + while (min_io_unit == round_down(offs, c->min_io_size) && + min_io_unit != offs && + drop_last_node(sleb, &offs, grouped)); + + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); ubifs_end_scan(c, sleb, lnum, offs); - if (need_clean) { - err = fix_unclean_leb(c, sleb, start); - if (err) - goto error; - } + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; return sleb; +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + dbg_err("corruptio %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); err = -EUCLEAN; @@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) } /** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. * @c: UBIFS file-system description object * @@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) { struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; struct ubifs_lprops lp; - int lnum, err; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); c->gc_lnum = -1; - if (wbuf->lnum == -1) { - dbg_rcvry("no GC head LEB"); - goto find_free; - } - /* - * See whether the used space in the dirtiest LEB fits in the GC head - * LEB. - */ - if (wbuf->offs == c->leb_size) { - dbg_rcvry("no room in GC head LEB"); - goto find_free; - } + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); if (err) { - /* - * There are no dirty or empty LEBs subject to here being - * enough for the index. Try to use - * 'ubifs_find_free_leb_for_idx()', which will return any empty - * LEBs (ignoring index requirements). If the index then - * doesn't have enough LEBs the recovery commit will fail - - * which is the same result anyway i.e. recovery fails. So - * there is no problem ignoring index requirements and just - * grabbing a free LEB since we have already established there - * is not a dirty LEB we could have used instead. - */ - if (err == -ENOSPC) { - dbg_rcvry("could not find a dirty LEB"); - goto find_free; - } - return err; - } - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - lnum = lp.lnum; - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - if (lp.free != c->leb_size) { - err = ubifs_change_one_lp(c, lnum, c->leb_size, - 0, 0, 0, 0); - if (err) - return err; - } - err = ubifs_leb_unmap(c, lnum); - if (err) + if (err != -ENOSPC) return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); - } - /* - * There was no empty LEB so the used space in the dirtiest LEB must fit - * in the GC head LEB. - */ - if (lp.free + lp.dirty < wbuf->offs) { - dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", - lnum, wbuf->lnum, wbuf->offs); - err = ubifs_return_leb(c, lnum); - if (err) - return err; - goto find_free; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + /* * We run the commit before garbage collection otherwise subsequent * mounts will see the GC and orphan deletion in a different order. @@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) err = ubifs_run_commit(c); if (err) return err; - /* - * The data in the dirtiest LEB fits in the GC head LEB, so do the GC - * - use locking to keep 'ubifs_assert()' happy. - */ - dbg_rcvry("GC'ing LEB %d", lnum); + + dbg_rcvry("GC'ing LEB %d", lp.lnum); mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); err = ubifs_garbage_collect_leb(c, &lp); if (err >= 0) { @@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) err = -EINVAL; return err; } - if (err != LEB_RETAINED) { - dbg_err("GC returned %d", err); + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) return -EINVAL; - } + err = ubifs_leb_unmap(c, c->gc_lnum); if (err) return err; - dbg_rcvry("allocated LEB %d for GC", lnum); - return 0; -find_free: - /* - * There is no GC head LEB or the free space in the GC head LEB is too - * small, or there are not dirty LEBs. Allocate gc_lnum by calling - * 'ubifs_find_free_leb_for_idx()' so GC is not run. - */ - lnum = ubifs_find_free_leb_for_idx(c); - if (lnum < 0) { - dbg_err("could not find an empty LEB"); - return lnum; - } - /* And reset the index flag */ - err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, - LPROPS_INDEX, 0); - if (err) - return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; } /** @@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); if (err) goto out; - dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", (unsigned long)e->inum, lnum, offs, i_size, e->d_size); return 0; @@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c) e->i_size = le64_to_cpu(ino->size); } } + if (e->exists && e->i_size < e->d_size) { - if (!e->inode && c->ro_mount) { + if (c->ro_mount) { /* Fix the inode size and pin it in memory */ struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); inode = ubifs_iget(c->vfs_sb, e->inum); if (IS_ERR(inode)) return PTR_ERR(inode); + + ui = ubifs_inode(inode); if (inode->i_size < e->d_size) { dbg_rcvry("ino %lu size %lld -> %lld", (unsigned long)e->inum, - e->d_size, inode->i_size); + inode->i_size, e->d_size); inode->i_size = e->d_size; - ubifs_inode(inode)->ui_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; e->inode = inode; this = rb_next(this); continue; @@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c) iput(e->inode); } } + this = rb_next(this); rb_erase(&e->rb, &c->size_tree); kfree(e); } + return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index d3d6d365bfc..6617280d167 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -33,44 +33,32 @@ */ #include "ubifs.h" - -/* - * Replay flags. - * - * REPLAY_DELETION: node was deleted - * REPLAY_REF: node is a reference node - */ -enum { - REPLAY_DELETION = 1, - REPLAY_REF = 2, -}; +#include <linux/list_sort.h> /** - * struct replay_entry - replay tree entry. + * struct replay_entry - replay list entry. * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number - * @flags: replay flags - * @rb: links the replay tree + * @list: links the replay list * @key: node key * @nm: directory entry name * @old_size: truncation old size * @new_size: truncation new size - * @free: amount of free space in a bud - * @dirty: amount of dirty space in a bud from padding and deletion nodes - * @jhead: journal head number of the bud * - * UBIFS journal replay must compare node sequence numbers, which means it must - * build a tree of node information to insert into the TNC. + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. */ struct replay_entry { int lnum; int offs; int len; + unsigned int deletion:1; unsigned long long sqnum; - int flags; - struct rb_node rb; + struct list_head list; union ubifs_key key; union { struct qstr nm; @@ -78,11 +66,6 @@ struct replay_entry { loff_t old_size; loff_t new_size; }; - struct { - int free; - int dirty; - int jhead; - }; }; }; @@ -90,57 +73,64 @@ struct replay_entry { * struct bud_entry - entry in the list of buds to replay. * @list: next bud in the list * @bud: bud description object - * @free: free bytes in the bud * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud */ struct bud_entry { struct list_head list; struct ubifs_bud *bud; - int free; unsigned long long sqnum; + int free; + int dirty; }; /** * set_bud_lprops - set free and dirty space used by a bud. * @c: UBIFS file-system description object - * @r: replay entry of bud + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. */ -static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) { const struct ubifs_lprops *lp; int err = 0, dirty; ubifs_get_lprops(c); - lp = ubifs_lpt_lookup_dirty(c, r->lnum); + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } dirty = lp->dirty; - if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { /* * The LEB was added to the journal with a starting offset of * zero which means the LEB must have been empty. The LEB - * property values should be lp->free == c->leb_size and - * lp->dirty == 0, but that is not the case. The reason is that - * the LEB was garbage collected. The garbage collector resets - * the free and dirty space without recording it anywhere except - * lprops, so if there is not a commit then lprops does not have - * that information next time the file system is mounted. + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. * * We do not need to adjust free space because the scan has told * us the exact value which is recorded in the replay entry as - * r->free. + * @b->free. * * However we do need to subtract from the dirty space the * amount of space that the garbage collector reclaimed, which * is the whole LEB minus the amount of space that was free. */ - dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); - dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); dirty -= c->leb_size - lp->free; /* @@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) */ if (dirty != 0) dbg_msg("LEB %d lp: %d free %d dirty " - "replay: %d free %d dirty", r->lnum, lp->free, - lp->dirty, r->free, r->dirty); + "replay: %d free %d dirty", b->bud->lnum, + lp->free, lp->dirty, b->free, b->dirty); } - lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, lp->flags | LPROPS_TAKEN, 0); if (IS_ERR(lp)) { err = PTR_ERR(lp); @@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) } /* Make sure the journal head points to the latest bud */ - err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, - c->leb_size - r->free, UBI_SHORTTERM); + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free, + UBI_SHORTTERM); out: ubifs_release_lprops(c); @@ -172,6 +163,27 @@ out: } /** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** * trun_remove_range - apply a replay entry for a truncation to the TNC. * @c: UBIFS file-system description object * @r: replay entry of truncation @@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) */ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { - int err, deletion = ((r->flags & REPLAY_DELETION) != 0); + int err; - dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, - r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); + dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); /* Set c->replay_sqnum to help deal with dangling branches. */ c->replay_sqnum = r->sqnum; - if (r->flags & REPLAY_REF) - err = set_bud_lprops(c, r); - else if (is_hash_key(c, &r->key)) { - if (deletion) + if (is_hash_key(c, &r->key)) { + if (r->deletion) err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); else err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, r->len, &r->nm); } else { - if (deletion) + if (r->deletion) switch (key_type(c, &r->key)) { case UBIFS_INO_KEY: { @@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) return err; if (c->need_recovery) - err = ubifs_recover_size_accum(c, &r->key, deletion, + err = ubifs_recover_size_accum(c, &r->key, r->deletion, r->new_size); } @@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) } /** - * destroy_replay_tree - destroy the replay. - * @c: UBIFS file-system description object + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @a: second replay entry * - * Destroy the replay tree. + * This is a comparios function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numer. Returns %1 if @a has + * greater sequence number and %-1 otherwise. */ -static void destroy_replay_tree(struct ubifs_info *c) +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) { - struct rb_node *this = c->replay_tree.rb_node; - struct replay_entry *r; - - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - r = rb_entry(this, struct replay_entry, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &r->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - if (is_hash_key(c, &r->key)) - kfree(r->nm.name); - kfree(r); - } - c->replay_tree = RB_ROOT; + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; } /** - * apply_replay_tree - apply the replay tree to the TNC. + * apply_replay_list - apply the replay list to the TNC. * @c: UBIFS file-system description object * - * Apply the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. */ -static int apply_replay_tree(struct ubifs_info *c) +static int apply_replay_list(struct ubifs_info *c) { - struct rb_node *this = rb_first(&c->replay_tree); + struct replay_entry *r; + int err; - while (this) { - struct replay_entry *r; - int err; + list_sort(c, &c->replay_list, &replay_entries_cmp); + list_for_each_entry(r, &c->replay_list, list) { cond_resched(); - r = rb_entry(this, struct replay_entry, rb); err = apply_replay_entry(c, r); if (err) return err; - this = rb_next(this); } + return 0; } /** - * insert_node - insert a node to the replay tree. + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c) * @old_size: truncation old size * @new_size: truncation new size * - * This function inserts a scanned non-direntry node to the replay tree. The - * replay tree is an RB-tree containing @struct replay_entry elements which are - * indexed by the sequence number. The replay tree is applied at the very end - * of the replay process. Since the tree is sorted in sequence number order, - * the older modifications are applied first. This function returns zero in - * case of success and a negative error code in case of failure. + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. */ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, unsigned long long sqnum, int deletion, int *used, loff_t old_size, loff_t new_size) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; @@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; - r->flags = (deletion ? REPLAY_DELETION : 0); + key_copy(c, key, &r->key); r->old_size = old_size; r->new_size = new_size; - key_copy(c, key, &r->key); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } /** - * insert_dent - insert a directory entry node into the replay tree. + * insert_dent - insert a directory entry node into the replay list. * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * @deletion: non-zero if this is a deletion * @used: number of bytes in use in a LEB * - * This function inserts a scanned directory entry node to the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. - * - * This function is also used for extended attribute entries because they are - * implemented as directory entry nodes. + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, const char *name, int nlen, unsigned long long sqnum, int deletion, int *used) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; char *nbuf; + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } - if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; + nbuf = kmalloc(nlen + 1, GFP_KERNEL); if (!nbuf) { kfree(r); @@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; + key_copy(c, key, &r->key); r->nm.len = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; r->nm.name = nbuf; - r->flags = (deletion ? REPLAY_DELETION : 0); - key_copy(c, key, &r->key); - ubifs_assert(!*p); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } @@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c, } /** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubi_read(c->ubi, next->lnum, (char *)&data, + next->start, 4); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** * replay_bud - replay a bud logical eraseblock. * @c: UBIFS file-system description object - * @lnum: bud logical eraseblock number to replay - * @offs: bud start offset - * @jhead: journal head to which this bud belongs - * @free: amount of free space in the bud is returned here - * @dirty: amount of dirty space from padding and deletion nodes is returned - * here + * @b: bud entry which describes the bud * - * This function returns zero in case of success and a negative error code in - * case of failure. + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ -static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, - int *free, int *dirty) +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) { - int err = 0, used = 0; + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_bud *bud; - dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); - if (c->need_recovery) - sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, + b->bud->jhead != GCHD); else sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); if (IS_ERR(sleb)) @@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, goto out; } - bud = ubifs_search_bud(c, lnum); - if (!bud) - BUG(); - + ubifs_assert(ubifs_search_bud(c, lnum)); ubifs_assert(sleb->endpt - offs >= used); ubifs_assert(sleb->endpt % c->min_io_size == 0); - *dirty = sleb->endpt - offs - used; - *free = c->leb_size - sleb->endpt; + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); out: ubifs_scan_destroy(sleb); @@ -649,58 +694,6 @@ out_dump: } /** - * insert_ref_node - insert a reference node to the replay tree. - * @c: UBIFS file-system description object - * @lnum: node logical eraseblock number - * @offs: node offset - * @sqnum: sequence number - * @free: amount of free space in bud - * @dirty: amount of dirty space from padding and deletion nodes - * @jhead: journal head number for the bud - * - * This function inserts a reference node to the replay tree and returns zero - * in case of success or a negative error code in case of failure. - */ -static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, - unsigned long long sqnum, int free, int dirty, - int jhead) -{ - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; - struct replay_entry *r; - - dbg_mnt("add ref LEB %d:%d", lnum, offs); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay tree"); - return -EINVAL; - } - - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->lnum = lnum; - r->offs = offs; - r->sqnum = sqnum; - r->flags = REPLAY_REF; - r->free = free; - r->dirty = dirty; - r->jhead = jhead; - - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); - return 0; -} - -/** * replay_buds - replay all buds. * @c: UBIFS file-system description object * @@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, static int replay_buds(struct ubifs_info *c) { struct bud_entry *b; - int err, uninitialized_var(free), uninitialized_var(dirty); + int err; + unsigned long long prev_sqnum = 0; list_for_each_entry(b, &c->replay_buds, list) { - err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, - &free, &dirty); - if (err) - return err; - err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, - free, dirty, b->bud->jhead); + err = replay_bud(c, b); if (err) return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; } return 0; @@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c) if (err) goto out; - err = apply_replay_tree(c); + err = apply_replay_list(c); + if (err) + goto out; + + err = set_buds_lprops(c); if (err) goto out; /* - * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable - * to roughly estimate index growth. Things like @c->min_idx_lebs + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs * depend on it. This means we have to initialize it to make sure * budgeting works properly. */ - c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); - c->budg_uncommitted_idx *= c->max_idx_node_sz; + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, (unsigned long)c->highest_inum); out: - destroy_replay_tree(c); + destroy_replay_list(c); destroy_bud_list(c); c->replaying = 0; return err; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index bf31b4729e5..c606f010e8d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -475,7 +475,8 @@ failed: * @c: UBIFS file-system description object * * This function returns a pointer to the superblock node or a negative error - * code. + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. */ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) { @@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c) c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); memcpy(&c->uuid, &sup->uuid, 16); c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; @@ -650,3 +652,152 @@ out: kfree(sup); return err; } + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. + */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubi_leb_unmap(c->ubi, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + if (err) + return err; + + return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* Fixup the current log head */ + err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. + */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg("start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg("free space fixup complete"); + return err; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 04ad07f4fcc..6db0bdaa9f7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -375,7 +375,7 @@ out: ubifs_release_dirty_inode_budget(c, ui); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } done: @@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c) * be compressed and direntries are of the maximum size. * * Note, data, which may be stored in inodes is budgeted separately, so - * it is not included into 'c->inode_budget'. + * it is not included into 'c->bi.inode_budget'. */ - c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; - c->inode_budget = UBIFS_INO_NODE_SZ; - c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; /* * When the amount of flash space used by buds becomes @@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c) { long long tmp64; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* @@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { - ubifs_err("insufficient free space to mount in read/write mode"); - dbg_dump_budg(c); + ubifs_err("insufficient free space to mount in R/W mode"); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); return -ENOSPC; } @@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_lpt; - err = dbg_check_idx_size(c, c->old_idx_sz); + err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; @@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_journal; /* Calculate 'min_idx_lebs' after journal replay */ - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) @@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c) } else ubifs_assert(c->lst.taken_empty_lebs > 0); + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_infos; + } + err = dbg_check_filesystem(c); if (err) goto out_infos; @@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c) c->main_lebs, c->main_first, c->leb_cnt - 1); dbg_msg("index LEBs: %d", c->lst.idx_lebs); dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", - c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); dbg_msg("key hash type: %d", c->key_hash_type); dbg_msg("tree fanout: %d", c->fanout); dbg_msg("reserved GC LEB: %d", c->gc_lnum); @@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", - UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); @@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) } sup->leb_cnt = cpu_to_le32(c->leb_cnt); err = ubifs_write_sb_node(c, sup); + kfree(sup); if (err) goto out; } @@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c) */ err = dbg_check_space_info(c); } + + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; + } + mutex_unlock(&c->umount_mutex); return err; @@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb) * to write them back because of I/O errors. */ if (!c->ro_error) { - ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); - ubifs_assert(c->budg_idx_growth == 0); - ubifs_assert(c->budg_dd_growth == 0); - ubifs_assert(c->budg_data_growth == 0); + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); } /* diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index de485979ca3..8119b1fd8d9 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, if (err) { /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } err = tnc_delete(c, znode, n); } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 53288e5d604..41920f357bb 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) c->gap_lebs = NULL; return err; } - if (!dbg_force_in_the_gaps_enabled) { + if (dbg_force_in_the_gaps_enabled()) { /* * Do not print scary warnings if the debugging * option which forces in-the-gaps is enabled. */ - ubifs_err("out of space"); - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + ubifs_warn("out of space"); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); } /* Try to commit anyway */ @@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) spin_lock(&c->space_lock); /* * Although we have not finished committing yet, update size of the - * committed index ('c->old_idx_sz') and zero out the index growth + * committed index ('c->bi.old_idx_sz') and zero out the index growth * budget. It is OK to do this now, because we've reserved all the * space which is needed to commit the index, and it is save for the * budgeting subsystem to assume the index is already committed, * even though it is not. */ - ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); - c->old_idx_sz = c->calc_idx_sz; - c->budg_uncommitted_idx = 0; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 191ca7863fe..e24380cf46e 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -408,9 +408,11 @@ enum { * Superblock flags. * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed */ enum { UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, }; /** @@ -434,7 +436,7 @@ struct ubifs_ch { __u8 node_type; __u8 group_type; __u8 padding[2]; -} __attribute__ ((packed)); +} __packed; /** * union ubifs_dev_desc - device node descriptor. @@ -448,7 +450,7 @@ struct ubifs_ch { union ubifs_dev_desc { __le32 new; __le64 huge; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ino_node - inode node. @@ -509,7 +511,7 @@ struct ubifs_ino_node { __le16 compr_type; __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_dent_node - directory entry node. @@ -534,7 +536,7 @@ struct ubifs_dent_node { __le16 nlen; __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ __u8 name[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_data_node - data node. @@ -555,7 +557,7 @@ struct ubifs_data_node { __le16 compr_type; __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_trun_node - truncation node. @@ -575,7 +577,7 @@ struct ubifs_trun_node { __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ __le64 old_size; __le64 new_size; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_pad_node - padding node. @@ -586,7 +588,7 @@ struct ubifs_trun_node { struct ubifs_pad_node { struct ubifs_ch ch; __le32 pad_len; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_sb_node - superblock node. @@ -644,7 +646,7 @@ struct ubifs_sb_node { __u8 uuid[16]; __le32 ro_compat_version; __u8 padding2[3968]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_mst_node - master node. @@ -711,7 +713,7 @@ struct ubifs_mst_node { __le32 idx_lebs; __le32 leb_cnt; __u8 padding[344]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ref_node - logical eraseblock reference node. @@ -727,7 +729,7 @@ struct ubifs_ref_node { __le32 offs; __le32 jhead; __u8 padding[28]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_branch - key/reference/length branch @@ -741,7 +743,7 @@ struct ubifs_branch { __le32 offs; __le32 len; __u8 key[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_idx_node - indexing node. @@ -755,7 +757,7 @@ struct ubifs_idx_node { __le16 child_cnt; __le16 level; __u8 branches[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_cs_node - commit start node. @@ -765,7 +767,7 @@ struct ubifs_idx_node { struct ubifs_cs_node { struct ubifs_ch ch; __le64 cmt_no; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_orph_node - orphan node. @@ -777,6 +779,6 @@ struct ubifs_orph_node { struct ubifs_ch ch; __le64 cmt_no; __le64 inos[]; -} __attribute__ ((packed)); +} __packed; #endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 8c40ad3c672..93d1412a06f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb { * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot * make sure @inode->i_size is always changed under @ui_mutex, because it - * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock - * with 'ubifs_writepage()' (see file.c). All the other inode fields are - * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one * could consider to rework locking and base it on "shadow" fields. */ struct ubifs_inode { @@ -937,6 +937,40 @@ struct ubifs_mount_opts { unsigned int compr_type:2; }; +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, nenver changed after mount) + * @inode_budget: budget for an inode (constant, nenver changed after mount) + * @dent_budget: budget for a directory entry (constant, nenver changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; +}; + struct ubifs_debug_info; /** @@ -980,6 +1014,7 @@ struct ubifs_debug_info; * @cmt_wq: wait queue to sleep on if the log is full and a commit is running * * @big_lpt: flag that LPT is too big to write whole during commit + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads @@ -1057,32 +1092,14 @@ struct ubifs_debug_info; * @dirty_zn_cnt: number of dirty znodes * @clean_zn_cnt: number of clean znodes * - * @budg_idx_growth: amount of bytes budgeted for index growth - * @budg_data_growth: amount of bytes budgeted for cached data - * @budg_dd_growth: amount of bytes budgeted for cached data that will make - * other data dirty - * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, - * but which still have to be taken into account because - * the index has not been committed so far - * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, - * @nospace, and @nospace_rp; - * @min_idx_lebs: minimum number of LEBs required for the index - * @old_idx_sz: size of index on flash + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) - * @lst: lprops statistics - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full - * - * @page_budget: budget for a page - * @inode_budget: budget for an inode - * @dent_budget: budget for a directory entry * * @ref_node_alsz: size of the LEB reference node aligned to the min. flash - * I/O unit + * I/O unit * @mst_node_alsz: master node aligned size * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary @@ -1189,7 +1206,6 @@ struct ubifs_debug_info; * @replaying: %1 during journal replay * @mounting: %1 while mounting * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode - * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) @@ -1238,6 +1254,7 @@ struct ubifs_info { wait_queue_head_t cmt_wq; unsigned int big_lpt:1; + unsigned int space_fixup:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; @@ -1308,21 +1325,10 @@ struct ubifs_info { atomic_long_t dirty_zn_cnt; atomic_long_t clean_zn_cnt; - long long budg_idx_growth; - long long budg_data_growth; - long long budg_dd_growth; - long long budg_uncommitted_idx; spinlock_t space_lock; - int min_idx_lebs; - unsigned long long old_idx_sz; - unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; - unsigned int nospace:1; - unsigned int nospace_rp:1; - - int page_budget; - int inode_budget; - int dent_budget; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; int ref_node_alsz; int mst_node_alsz; @@ -1430,7 +1436,6 @@ struct ubifs_info { unsigned int replaying:1; unsigned int mounting:1; unsigned int remounting_rw:1; - struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; @@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c); int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3299f469e71..16f19f55e63 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -80,8 +80,8 @@ enum { SECURITY_XATTR, }; -static const struct inode_operations none_inode_operations; -static const struct file_operations none_file_operations; +static const struct inode_operations empty_iops; +static const struct file_operations empty_fops; /** * create_xattr - create an extended attribute. @@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, /* Re-define all operations to be "nothing" */ inode->i_mapping->a_ops = &empty_aops; - inode->i_op = &none_inode_operations; - inode->i_fop = &none_file_operations; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; ui = ubifs_inode(inode); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index f1dce848ef9..4d76594c2a8 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; + dentry_unhash(dentry); + retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); if (!fi) @@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { if (ofibh.sbh != ofibh.ebh) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 46f7a807bbc..42694e11c23 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -424,8 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_cpu_to_data_ptr(sb, p, result); *err = 0; UFS_I(inode)->i_lastfrag = - max_t(u32, UFS_I(inode)->i_lastfrag, - fragment + count); + max(UFS_I(inode)->i_lastfrag, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } @@ -440,7 +439,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); if (result) { *err = 0; - UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); unlock_super(sb); @@ -479,7 +479,8 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, uspi->s_sbbase + result, locked_page); ufs_cpu_to_data_ptr(sb, p, result); *err = 0; - UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); + UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, + fragment + count); unlock_super(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e765743cf9f..b4d791a8320 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -409,7 +409,7 @@ out: } /** - * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and + * ufs_getfrag_block() - `get_block_t' function, interface between UFS and * readpage, writepage and so on */ diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 29309e25417..953ebdfc5bf 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = dentry->d_inode; int err= -ENOTEMPTY; + dentry_unhash(dentry); + lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); @@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; + if (new_inode && S_ISDIR(new_inode->i_mode)) + dentry_unhash(new_dentry); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 5f821dbc057..f04f89fbd4d 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -84,7 +84,7 @@ static int ufs_trunc_direct(struct inode *inode) retry = 0; frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u32, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); frag3 = frag4 & ~uspi->s_fpbmask; block1 = block2 = 0; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9ef9ed2cfe2..5e68099db2a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -33,7 +33,6 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> -#include <linux/list_sort.h> #include "xfs_sb.h" #include "xfs_inum.h" @@ -709,6 +708,27 @@ xfs_buf_get_empty( return bp; } +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t len) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_file_offset = 0; + bp->b_buffer_length = bp->b_count_desired = len; + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_flags &= ~XBF_MAPPED; +} + static inline struct page * mem_to_page( void *addr) @@ -1402,12 +1422,12 @@ restart: int xfs_buftarg_shrink( struct shrinker *shrink, - int nr_to_scan, - gfp_t mask) + struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); struct xfs_buf *bp; + int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); if (!nr_to_scan) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a9a1c451264..50a7d5fb3b7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c index d61611c8801..244e797dae3 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -191,3 +191,32 @@ xfs_ioc_trim( return -XFS_ERROR(EFAULT); return 0; } + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_busy_extent *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h index e82b6dd3e12..344879aea64 100644 --- a/fs/xfs/linux-2.6/xfs_discard.h +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -2,7 +2,9 @@ #define XFS_DISCARD_H 1 struct fstrim_range; +struct list_head; extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index b3486dfa552..54e623bfbb8 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -586,7 +586,8 @@ xfs_file_compat_ioctl( case XFS_IOC_RESVSP_32: case XFS_IOC_UNRESVSP_32: case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: { + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { struct xfs_flock64 bf; if (xfs_compat_flock64_copyin(&bf, arg)) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 08b605792a9..80f4060e897 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 { #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) #define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 244be9cbfe7..8633521b3b2 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -70,6 +70,7 @@ #include <linux/ctype.h> #include <linux/writeback.h> #include <linux/capability.h> +#include <linux/list_sort.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c index 9f76cceb678..bd672def95a 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -41,23 +41,6 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -void xfs_printk( - const char *level, - const struct xfs_mount *mp, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - __xfs_printk(level, mp, &vaf); - va_end(args); -} - #define define_xfs_printk_level(func, kern_level) \ void func(const struct xfs_mount *mp, const char *fmt, ...) \ { \ @@ -95,8 +78,7 @@ xfs_alert_tag( int do_panic = 0; if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { - xfs_printk(KERN_ALERT, mp, - "XFS: Transforming an alert into a BUG."); + xfs_alert(mp, "Transforming an alert into a BUG."); do_panic = 1; } diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h index f1b3fc1b6c4..7fb7ea00767 100644 --- a/fs/xfs/linux-2.6/xfs_message.h +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -3,9 +3,6 @@ struct xfs_mount; -extern void xfs_printk(const char *level, const struct xfs_mount *mp, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) @@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); #else -static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +static inline void +__attribute__ ((format (printf, 2, 3))) +xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) { } #endif diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index b38e58d0229..98b9c91fcdf 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ /* * Table driven mount option parser. @@ -355,6 +357,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, "ihashsize")) { xfs_warn(mp, "ihashsize no longer used, option is deprecated."); @@ -388,6 +394,13 @@ xfs_parseargs( return EINVAL; } + if ((mp->m_flags & XFS_MOUNT_DISCARD) && + !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { + xfs_warn(mp, + "the discard option is incompatible with the nodelaylog option"); + return EINVAL; + } + #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -488,6 +501,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, + { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1787,10 +1801,6 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; - error = xfs_init_workqueues(); - if (error) - goto out_sysctl_unregister; - vfs_initquota(); error = register_filesystem(&xfs_fs_type); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 3e898a48122..8ecad5ff9f9 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -267,6 +267,16 @@ xfs_sync_inode_attr( error = xfs_iflush(ip, flags); + /* + * We don't want to try again on non-blocking flushes that can't run + * again immediately. If an inode really must be written, then that's + * what the SYNC_WAIT flag is for. + */ + if (error == EAGAIN) { + ASSERT(!(flags & SYNC_WAIT)); + error = 0; + } + out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; @@ -1022,13 +1032,14 @@ xfs_reclaim_inodes( static int xfs_reclaim_inode_shrink( struct shrinker *shrink, - int nr_to_scan, - gfp_t gfp_mask) + struct shrink_control *sc) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; int reclaimable; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 2d0bcb47907..d48b7a579ae 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap, ); -#define XFS_BUSY_SYNC \ - { 0, "async" }, \ - { 1, "sync" } - -TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, int sync), - TP_ARGS(trans, agno, agbno, len, sync), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(struct xfs_trans *, tp) - __field(int, tid) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) - __field(xfs_extlen_t, len) - __field(int, sync) - ), - TP_fast_assign( - __entry->dev = trans->t_mountp->m_super->s_dev; - __entry->tp = trans; - __entry->tid = trans->t_ticket->t_tid; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->len = len; - __entry->sync = sync; - ), - TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->tp, - __entry->tid, - __entry->agno, - __entry->agbno, - __entry->len, - __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) - -); - -TRACE_EVENT(xfs_alloc_unbusy, +DECLARE_EVENT_CLASS(xfs_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy, __entry->agbno, __entry->len) ); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_alloc_busy); +DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); +DEFINE_BUSY_EVENT(xfs_alloc_busy_force); +DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); +DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); -#define XFS_BUSY_STATES \ - { 0, "missing" }, \ - { 1, "found" } - -TRACE_EVENT(xfs_alloc_busysearch, +TRACE_EVENT(xfs_alloc_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, int found), - TP_ARGS(mp, agno, agbno, len, found), + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, found) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->found = found; + __entry->tbno = tbno; + __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno %u agbno %u len %u %s", + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->tbno, + __entry->tlen) ); TRACE_EVENT(xfs_trans_commit_lsn, @@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->wasfromfl, __entry->isfl, __entry->userdata, - __entry->firstblock) + (unsigned long long)__entry->firstblock) ) #define DEFINE_ALLOC_EVENT(name) \ @@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_size_done); DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); DEFINE_ALLOC_EVENT(xfs_alloc_small_done); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 69228aa8605..b94dace4e78 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist( STATIC int xfs_qm_shake( struct shrinker *shrink, - int nr_to_scan, - gfp_t gfp_mask) + struct shrink_control *sc) { int ndqused, nfree, n; + gfp_t gfp_mask = sc->gfp_mask; if (!kmem_shake_allow(gfp_mask)) return 0; diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 58632cc17f2..6530769a999 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -187,7 +187,9 @@ struct xfs_busy_extent { xfs_agnumber_t agno; xfs_agblock_t bno; xfs_extlen_t length; - xlog_tid_t tid; /* transaction that created this */ + unsigned int flags; +#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 27d64d752ea..95862bbff56 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,19 +41,13 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -/* - * Prototypes for per-ag allocation routines - */ - STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); - -/* - * Internal functions. - */ + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, + xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -154,19 +148,21 @@ xfs_alloc_compute_aligned( xfs_extlen_t *reslen) /* result length */ { xfs_agblock_t bno; - xfs_extlen_t diff; xfs_extlen_t len; - if (args->alignment > 1 && foundlen >= args->minlen) { - bno = roundup(foundbno, args->alignment); - diff = bno - foundbno; - len = diff >= foundlen ? 0 : foundlen - diff; + /* Trim busy sections out of found extent */ + xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; } else { - bno = foundbno; - len = foundlen; + *resbno = bno; + *reslen = len; } - *resbno = bno; - *reslen = len; } /* @@ -280,7 +276,6 @@ xfs_alloc_fix_minleft( return 1; agf = XFS_BUF_TO_AGF(args->agbp); diff = be32_to_cpu(agf->agf_freeblks) - + be32_to_cpu(agf->agf_flcount) - args->len - args->minleft; if (diff >= 0) return 1; @@ -541,16 +536,8 @@ xfs_alloc_ag_vextent( if (error) return error; - /* - * Search the busylist for these blocks and mark the - * transaction as synchronous if blocks are found. This - * avoids the need to block due to a synchronous log - * force to ensure correct ordering as the synchronous - * transaction will guarantee that for us. - */ - if (xfs_alloc_busy_search(args->mp, args->agno, - args->agbno, args->len)) - xfs_trans_set_sync(args->tp); + ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)); } if (!args->isfl) { @@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact( { xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ - xfs_agblock_t end; /* end of allocated extent */ int error; xfs_agblock_t fbno; /* start block of found extent */ - xfs_agblock_t fend; /* end block of found extent */ xfs_extlen_t flen; /* length of found extent */ + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t end; /* end of allocated extent */ int i; /* success/failure of operation */ - xfs_agblock_t maxend; /* end of maximal extent */ - xfs_agblock_t minend; /* end of minimal extent */ xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); @@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact( goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); - minend = args->agbno + args->minlen; - maxend = args->agbno + args->maxlen; - fend = fbno + flen; /* - * Give up if the freespace isn't long enough for the minimum request. + * Check for overlapping busy extents. */ - if (fend < minend) + xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + + /* + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. + */ + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) goto not_found; /* @@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact( * * Fix the length according to mod and prod if given. */ - end = XFS_AGBLOCK_MIN(fend, maxend); + end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); args->len = end - args->agbno; xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) goto not_found; rlen = args->len; - ASSERT(args->agbno + rlen <= fend); + ASSERT(args->agbno + rlen <= tend); end = args->agbno + rlen; /* @@ -686,11 +681,11 @@ xfs_alloc_find_best_extent( struct xfs_btree_cur **scur, /* searching cursor */ xfs_agblock_t gdiff, /* difference for search comparison */ xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, - xfs_extlen_t *slena, /* aligned length */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ int dir) /* 0 = search right, 1 = search left */ { - xfs_agblock_t bno; xfs_agblock_t new; xfs_agblock_t sdiff; int error; @@ -708,16 +703,16 @@ xfs_alloc_find_best_extent( if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* * The good extent is closer than this one. */ if (!dir) { - if (bno >= args->agbno + gdiff) + if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { - if (bno <= args->agbno - gdiff) + if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -729,8 +724,8 @@ xfs_alloc_find_best_extent( xfs_alloc_fix_len(args); sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbno, - *slen, &new); + args->alignment, *sbnoa, + *slena, &new); /* * Choose closer size and invalidate other cursor. @@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena = 0; /* aligned ... */ + xfs_extlen_t gtlena; /* aligned ... */ xfs_agblock_t gtnew; /* useful start bno of right side */ int error; /* error code */ int i; /* result code, temporary */ @@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena = 0; /* aligned ... */ + xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; #if defined(DEBUG) && defined(__KERNEL__) /* * Randomly don't execute the first algorithm. @@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near( dofirst = random32() & 1; #endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + /* * Get a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); - ltlen = 0; - bno_cur_lt = bno_cur_gt = NULL; + /* * See if there are any free extents as big as maxlen. */ @@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near( goto error0; if (i == 0 || ltlen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); return 0; } ASSERT(i == 1); } args->wasfromfl = 0; + /* * First algorithm. * If the requested extent is large wrt the freespaces available @@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near( if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbno, ltlen, <new); + args->alignment, ltbnoa, ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near( args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbno, ltlen, <new); + args->alignment, ltbnoa, ltlena, <new); error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt, - ltdiff, >bno, >len, >lena, + ltdiff, >bno, >len, + >bnoa, >lena, 0 /* search right */); } else { ASSERT(gtlena >= args->minlen); @@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near( args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, gtbno, gtlen, >new); + args->alignment, gtbnoa, gtlena, >new); error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt, - gtdiff, <bno, <len, <lena, + gtdiff, <bno, <len, + <bnoa, <lena, 1 /* search left */); } @@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near( * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; @@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near( return 0; } rlen = args->len; - (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, - ltlen, <new); + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltbno + ltlen); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->agbno = ltnew; + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, ltnew, rlen, XFSA_FIXUP_BNO_OK))) goto error0; @@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size( int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +restart: /* * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + /* * Look for an entry >= maxlen+alignment-1 blocks. */ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen + args->alignment - 1, &i))) goto error0; + /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. */ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, - &flen, &i))) + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) goto error0; if (i == 0 || flen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); @@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size( return 0; } ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. + */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } } - /* - * There's a freespace as big as maxlen+alignment-1, get it. - */ - else { - if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } + /* * In the first case above, we got the last entry in the * by-size btree. Now we check to see if the space hits maxlen * once aligned; if not, we search left for something better. * This can't happen in the second case above. */ - xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); @@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size( * Fix up the length. */ args->len = rlen; - xfs_alloc_fix_len(args); - if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_size_nominleft(args); - args->agbno = NULLAGBLOCK; - return 0; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); /* @@ -1287,6 +1350,12 @@ error0: if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; } /* @@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { + xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + if (args->userdata) { xfs_buf_t *bp; @@ -1617,18 +1689,6 @@ xfs_free_ag_extent( trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); - /* - * Since blocks move to the free list without the coordination - * used in xfs_bmap_finish, we can't allow block to be available - * for reallocation and non-transaction writing (user data) - * until we know that the transaction that moved it to the free - * list is permanently on disk. We track the blocks by declaring - * these blocks as "busy"; the busy list is maintained on a per-ag - * basis and each transaction records which entries should be removed - * when the iclog commits to disk. If a busy block is allocated, - * the iclog is pushed up to the LSN that freed the block. - */ - xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist( xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; - /* - * As blocks are freed, they are added to the per-ag busy list and - * remain there until the freeing transaction is committed to disk. - * Now that we have allocated blocks, this list must be searched to see - * if a block is being reused. If one is, then the freeing transaction - * must be pushed to disk before this transaction. - * - * We do this by setting the current transaction to a sync transaction - * which guarantees that the freeing transaction is on disk before this - * transaction. This is done instead of a synchronous log force here so - * that we don't sit and wait with the AGF locked in the transaction - * during the log force. - */ - if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) - xfs_trans_set_sync(tp); return 0; } @@ -2423,119 +2468,26 @@ xfs_free_extent( } error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); return error; } - -/* - * AG Busy list management - * The busy list contains block ranges that have been freed but whose - * transactions have not yet hit disk. If any block listed in a busy - * list is reused, the transaction that freed it must be forced to disk - * before continuing to use the block. - * - * xfs_alloc_busy_insert - add to the per-ag busy list - * xfs_alloc_busy_clear - remove an item from the per-ag busy list - * xfs_alloc_busy_search - search for a busy extent - */ - -/* - * Insert a new extent into the busy tree. - * - * The busy extent tree is indexed by the start block of the busy extent. - * there can be multiple overlapping ranges in the busy extent tree but only - * ever one entry at a given start block. The reason for this is that - * multi-block extents can be freed, then smaller chunks of that extent - * allocated and freed again before the first transaction commit is on disk. - * If the exact same start block is freed a second time, we have to wait for - * that busy extent to pass out of the tree before the new extent is inserted. - * There are two main cases we have to handle here. - * - * The first case is a transaction that triggers a "free - allocate - free" - * cycle. This can occur during btree manipulations as a btree block is freed - * to the freelist, then allocated from the free list, then freed again. In - * this case, the second extxpnet free is what triggers the duplicate and as - * such the transaction IDs should match. Because the extent was allocated in - * this transaction, the transaction must be marked as synchronous. This is - * true for all cases where the free/alloc/free occurs in the one transaction, - * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. - * This serves to catch violations of the second case quite effectively. - * - * The second case is where the free/alloc/free occur in different - * transactions. In this case, the thread freeing the extent the second time - * can't mark the extent busy immediately because it is already tracked in a - * transaction that may be committing. When the log commit for the existing - * busy extent completes, the busy extent will be removed from the tree. If we - * allow the second busy insert to continue using that busy extent structure, - * it can be freed before this transaction is safely in the log. Hence our - * only option in this case is to force the log to remove the existing busy - * extent from the list before we insert the new one with the current - * transaction ID. - * - * The problem we are trying to avoid in the free-alloc-free in separate - * transactions is most easily described with a timeline: - * - * Thread 1 Thread 2 Thread 3 xfslogd - * xact alloc - * free X - * mark busy - * commit xact - * free xact - * xact alloc - * alloc X - * busy search - * mark xact sync - * commit xact - * free xact - * force log - * checkpoint starts - * .... - * xact alloc - * free X - * mark busy - * finds match - * *** KABOOM! *** - * .... - * log IO completes - * unbusy X - * checkpoint completes - * - * By issuing a log force in thread 3 @ "KABOOM", the thread will block until - * the checkpoint completes, and the busy extent it matched will have been - * removed from the tree when it is woken. Hence it can then continue safely. - * - * However, to ensure this matching process is robust, we need to use the - * transaction ID for identifying transaction, as delayed logging results in - * the busy extent and transaction lifecycles being different. i.e. the busy - * extent is active for a lot longer than the transaction. Hence the - * transaction structure can be freed and reallocated, then mark the same - * extent busy again in the new transaction. In this case the new transaction - * will have a different tid but can have the same address, and hence we need - * to check against the tid. - * - * Future: for delayed logging, we could avoid the log force if the extent was - * first freed in the current checkpoint sequence. This, however, requires the - * ability to pin the current checkpoint in memory until this transaction - * commits to ensure that both the original free and the current one combine - * logically into the one checkpoint. If the checkpoint sequences are - * different, however, we still need to wait on a log force. - */ void xfs_alloc_busy_insert( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, - xfs_extlen_t len) + xfs_extlen_t len, + unsigned int flags) { struct xfs_busy_extent *new; struct xfs_busy_extent *busyp; struct xfs_perag *pag; struct rb_node **rbp; - struct rb_node *parent; - int match; - + struct rb_node *parent = NULL; new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); if (!new) { @@ -2544,7 +2496,7 @@ xfs_alloc_busy_insert( * block, make this a synchronous transaction to insure that * the block is not reused before this transaction commits. */ - trace_xfs_alloc_busy(tp, agno, bno, len, 1); + trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); xfs_trans_set_sync(tp); return; } @@ -2552,66 +2504,29 @@ xfs_alloc_busy_insert( new->agno = agno; new->bno = bno; new->length = len; - new->tid = xfs_log_get_trans_ident(tp); - INIT_LIST_HEAD(&new->list); + new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp, agno, bno, len, 0); + trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); pag = xfs_perag_get(tp->t_mountp, new->agno); -restart: spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; - parent = NULL; - busyp = NULL; - match = 0; - while (*rbp && match >= 0) { + while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); if (new->bno < busyp->bno) { - /* may overlap, but exact start block is lower */ rbp = &(*rbp)->rb_left; - if (new->bno + new->length > busyp->bno) - match = busyp->tid == new->tid ? 1 : -1; + ASSERT(new->bno + new->length <= busyp->bno); } else if (new->bno > busyp->bno) { - /* may overlap, but exact start block is higher */ rbp = &(*rbp)->rb_right; - if (bno < busyp->bno + busyp->length) - match = busyp->tid == new->tid ? 1 : -1; + ASSERT(bno >= busyp->bno + busyp->length); } else { - match = busyp->tid == new->tid ? 1 : -1; - break; + ASSERT(0); } } - if (match < 0) { - /* overlap marked busy in different transaction */ - spin_unlock(&pag->pagb_lock); - xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); - goto restart; - } - if (match > 0) { - /* - * overlap marked busy in same transaction. Update if exact - * start block match, otherwise combine the busy extents into - * a single range. - */ - if (busyp->bno == new->bno) { - busyp->length = max(busyp->length, new->length); - spin_unlock(&pag->pagb_lock); - ASSERT(tp->t_flags & XFS_TRANS_SYNC); - xfs_perag_put(pag); - kmem_free(new); - return; - } - rb_erase(&busyp->rb_node, &pag->pagb_tree); - new->length = max(busyp->bno + busyp->length, - new->bno + new->length) - - min(busyp->bno, new->bno); - new->bno = min(busyp->bno, new->bno); - } else - busyp = NULL; rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); @@ -2619,7 +2534,6 @@ restart: list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - kmem_free(busyp); } /* @@ -2668,31 +2582,466 @@ xfs_alloc_busy_search( } } spin_unlock(&pag->pagb_lock); - trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); return match; } +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_alloc_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ void -xfs_alloc_busy_clear( +xfs_alloc_busy_reuse( struct xfs_mount *mp, - struct xfs_busy_extent *busyp) + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) { struct xfs_perag *pag; + struct rb_node *rbp; - trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, - busyp->length); + ASSERT(flen > 0); - ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, - busyp->length) == 1); + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; - list_del_init(&busyp->list); + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - rb_erase(&busyp->rb_node, &pag->pagb_tree); + if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +STATIC void +xfs_alloc_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { + if (!xfs_alloc_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +static void +xfs_alloc_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp) +{ + if (busyp->length) { + trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + list_del_init(&busyp->list); kmem_free(busyp); } + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_busy_extent *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_ALLOC_BUSY_DISCARDED; + else + xfs_alloc_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_busy_extent_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_busy_extent, list)->agno - + container_of(b, struct xfs_busy_extent, list)->agno; +} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index d0b3bc72005..2f52b924be7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -137,14 +137,28 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); int xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +int +xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_alloc_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_busy_extent_ag_cmp); +} + #endif /* __KERNEL__ */ /* diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 3916925e258..2b3518826a6 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -95,6 +95,8 @@ xfs_allocbt_alloc_block( return 0; } + xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -118,18 +120,8 @@ xfs_allocbt_free_block( if (error) return error; - /* - * Since blocks move to the free list without the coordination used in - * xfs_bmap_finish, we can't allow block to be available for - * reallocation and non-transaction writing (user data) until we know - * that the transaction that moved it to the free list is permanently - * on disk. We track the blocks by declaring these blocks as "busy"; - * the busy list is maintained on a per-ag basis and each transaction - * records which entries should be removed when the iclog commits to - * disk. If a busy block is allocated, the iclog is pushed up to the - * LSN that freed the block. - */ - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_ALLOC_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index fa00788de2f..e546a33214c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local( int *flags); /* inode logging flags */ /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_add_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - int rsvd); /* OK to allocate reserved blocks */ - -/* * Called by xfs_bmap_add_extent to handle cases converting a delayed * allocation to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int rsvd); /* OK to allocate reserved blocks */ + int *logflagsp); /* inode logging flags */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real( STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp,/* inode logging flags */ - int rsvd); /* OK to allocate reserved blocks */ + int *logflagsp); /* inode logging flags */ /* * Called by xfs_bmap_add_extent to handle cases converting a hole @@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ @@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real( STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp); /* inode logging flags */ @@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents( int whichfork); /* data or attr fork */ /* - * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current trans pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp,/* inode logging flags */ - int whichfork, /* data or attr fork */ - int rsvd); /* OK to allocate reserved blocks */ - -/* * Remove the entry "free" from the free item list. Prev points to the * previous entry, unless "free" is the head of the list. */ @@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local( STATIC int /* error */ xfs_bmap_add_extent( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - int rsvd) /* OK to use reserved data blocks */ + int whichfork) /* data or attr fork */ { xfs_btree_cur_t *cur; /* btree cursor or null */ xfs_filblks_t da_new; /* new count del alloc blocks used */ @@ -492,23 +457,27 @@ xfs_bmap_add_extent( xfs_extnum_t nextents; /* number of extents in file now */ XFS_STATS_INC(xs_add_exlist); + cur = *curp; ifp = XFS_IFORK_PTR(ip, whichfork); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(idx <= nextents); da_old = da_new = 0; error = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx <= nextents); + /* * This is the first extent added to a new/empty file. * Special case this one, so other routines get to assume there are * already extents in the list. */ if (nextents == 0) { - xfs_iext_insert(ip, 0, 1, new, + xfs_iext_insert(ip, *idx, 1, new, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); ASSERT(cur == NULL); - ifp->if_lastex = 0; + if (!isnullstartblock(new->br_startblock)) { XFS_IFORK_NEXT_SET(ip, whichfork, 1); logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); @@ -522,27 +491,25 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags, rsvd))) - goto done; + error = xfs_bmap_add_extent_hole_delay(ip, idx, new, + &logflags); } /* * Real allocation off the end of the file. */ - else if (idx == nextents) { + else if (*idx == nextents) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, whichfork))) - goto done; + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork); } else { xfs_bmbt_irec_t prev; /* old extent at offset idx */ /* * Get the record referred to by idx. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); /* * If it's a real allocation record, and the new allocation ends * after the start of the referred to record, then we're filling @@ -557,22 +524,18 @@ xfs_bmap_add_extent( if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); - if ((error = xfs_bmap_add_extent_delay_real(ip, - idx, &cur, new, &da_new, first, flist, - &logflags, rsvd))) - goto done; - } else if (new->br_state == XFS_EXT_NORM) { - ASSERT(new->br_state == XFS_EXT_NORM); - if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags))) - goto done; + error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, + first, flist, &logflags); } else { - ASSERT(new->br_state == XFS_EXT_UNWRITTEN); - if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags))) + ASSERT(new->br_state == XFS_EXT_NORM || + new->br_state == XFS_EXT_UNWRITTEN); + + error = xfs_bmap_add_extent_unwritten_real(ip, + idx, &cur, new, &logflags); + if (error) goto done; } - ASSERT(*curp == cur || *curp == NULL); } /* * Otherwise we're filling in a hole with an allocation. @@ -581,13 +544,15 @@ xfs_bmap_add_extent( if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, whichfork))) - goto done; + error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork); } } + if (error) + goto done; ASSERT(*curp == cur || *curp == NULL); + /* * Convert to a btree if necessary. */ @@ -615,7 +580,7 @@ xfs_bmap_add_extent( ASSERT(nblks <= da_old); if (nblks < da_old) xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - nblks), rsvd); + (int64_t)(da_old - nblks), 0); } /* * Clear out the allocated field, done with it now in any case. @@ -640,14 +605,13 @@ done: STATIC int /* error */ xfs_bmap_add_extent_delay_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - int rsvd) /* OK to use reserved data block allocation */ + int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ int diff; /* temp value */ @@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real( */ cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); @@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, idx, 2, state); - ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, *idx + 1, 2, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx - 1; - xfs_iext_remove(ip, idx, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real( * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx; - xfs_iext_remove(ip, idx + 1, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, PREV.br_state))) goto done; } + *dnew = 0; break; @@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } + *dnew = 0; break; @@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; *dnew = temp; break; @@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real( * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, idx, 1, new, state); - ip->i_df.if_lastex = idx; + xfs_iext_insert(ip, *idx, 1, new, state); ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, idx + 1); + ep = xfs_iext_get_ext(ifp, *idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); + *dnew = temp; break; @@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); - ip->i_df.if_lastex = idx + 1; + trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_state))) goto done; } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; *dnew = temp; break; @@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_iext_insert(ip, idx + 1, 1, new, state); - ip->i_df.if_lastex = idx + 1; + xfs_iext_insert(ip, *idx + 1, 1, new, state); ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; *dnew = temp; break; @@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real( */ temp = new->br_startoff - PREV.br_startoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ LEFT = *new; RIGHT.br_state = PREV.br_state; @@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_startoff = new_endoff; RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ - xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); - ip->i_df.if_lastex = idx + 1; + xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), rsvd)) { + -((int64_t)diff), 0)) { /* * Ick gross gag me with a spoon. */ @@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), rsvd)) + -((int64_t)diff), 0)) break; } if (temp2) { @@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real( if (!diff || !xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - -((int64_t)diff), rsvd)) + -((int64_t)diff), 0)) break; } } } - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), nullstartblock((int)temp2)); - trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); + + ++*idx; *dnew = temp + temp2; break; @@ -1161,7 +1131,7 @@ done: STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ @@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real( error = 0; cur = *curp; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; oldext = (newext == XFS_EXT_UNWRITTEN) ? @@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real( * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, idx, 2, state); - ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, *idx + 1, 2, state); ip->i_d.di_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx - 1; - xfs_iext_remove(ip, idx, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real( * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx; - xfs_iext_remove(ip, idx + 1, 1, state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real( * Neither the left nor right neighbors are contiguous with * the new one. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_state(ep, newext); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; - ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real( * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, idx, 1, new, state); - ip->i_df.if_lastex = idx; + xfs_iext_insert(ip, *idx, 1, new, state); ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); - trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real( * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); - xfs_iext_insert(ip, idx + 1, 1, new, state); - ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); r[0] = *new; r[1].br_startoff = new_endoff; @@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - xfs_iext_insert(ip, idx + 1, 2, &r[0], state); - ip->i_df.if_lastex = idx + 1; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + ip->i_d.di_nextents += 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1650,12 +1625,10 @@ done: STATIC int /* error */ xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - int rsvd) /* OK to allocate reserved blocks */ + int *logflagsp) /* inode logging flags */ { - xfs_bmbt_rec_host_t *ep; /* extent record for idx */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t temp=0; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); state = 0; ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ - if (idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; @@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(ep, &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; @@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay( * on the left and on the right. * Merge all three into a single extent record. */ + --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(ip, idx, 1, state); - ip->i_df.if_lastex = idx - 1; + xfs_iext_remove(ip, *idx + 1, 1, state); break; case BMAP_LEFT_CONTIG: @@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay( * on the left. * Merge the new allocation with the left neighbor. */ + --*idx; temp = left.br_blockcount + new->br_blockcount; - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); - - ip->i_df.if_lastex = idx - 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_allf(ep, new->br_startoff, + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - - ip->i_df.if_lastex = idx; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_iext_insert(ip, idx, 1, new, state); - ip->i_df.if_lastex = idx; + xfs_iext_insert(ip, *idx, 1, new, state); break; } if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), rsvd); + (int64_t)(oldlen - newlen), 0); /* * Nothing to do for disk quota accounting here. */ @@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay( STATIC int /* error */ xfs_bmap_add_extent_hole_real( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ int error; /* error return value */ int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ @@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - ep = xfs_iext_get_ext(ifp, idx); + ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); state = 0; if (whichfork == XFS_ATTR_FORK) @@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(ep, &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); - xfs_iext_remove(ip, idx, 1, state); - ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { @@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ifp->if_lastex = idx - 1; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); - xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - ifp->if_lastex = idx; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { @@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(ip, idx, 1, new, state); - ifp->if_lastex = idx; + xfs_iext_insert(ip, *idx, 1, new, state); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { @@ -2833,13 +2801,12 @@ STATIC int /* error */ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t idx, /* extent number to update/delete */ + xfs_extnum_t *idx, /* extent number to update/delete */ xfs_bmap_free_t *flist, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - int rsvd) /* OK to allocate reserved blocks */ + int whichfork) /* data or attr fork */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -2870,10 +2837,10 @@ xfs_bmap_del_extent( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((idx >= 0) && (idx < ifp->if_bytes / + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; @@ -2947,11 +2914,12 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_iext_remove(ip, idx, 1, + xfs_iext_remove(ip, *idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - ifp->if_lastex = idx; + --*idx; if (delay) break; + XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; @@ -2968,21 +2936,20 @@ xfs_bmap_del_extent( /* * Deleting the first part of the extent. */ - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -2998,18 +2965,17 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); da_new = temp; break; } - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -3026,7 +2992,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. */ temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3113,9 +3079,9 @@ xfs_bmap_del_extent( } } } - trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); - xfs_iext_insert(ip, idx + 1, 1, &new, state); - ifp->if_lastex = idx + 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; break; } /* @@ -3142,7 +3108,7 @@ xfs_bmap_del_extent( ASSERT(da_old >= da_new); if (da_old > da_new) { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), rsvd); + (int64_t)(da_old - da_new), 0); } done: *logflagsp = flags; @@ -4562,29 +4528,24 @@ xfs_bmapi( if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), (flags & - XFS_BMAPI_RSVBLOCKS)); + -((int64_t)extsz), 0); } else { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), (flags & - XFS_BMAPI_RSVBLOCKS)); + -((int64_t)alen), 0); } if (!error) { error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), (flags & - XFS_BMAPI_RSVBLOCKS)); + -((int64_t)indlen), 0); if (error && rt) xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)extsz, (flags & - XFS_BMAPI_RSVBLOCKS)); + (int64_t)extsz, 0); else if (error) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)alen, (flags & - XFS_BMAPI_RSVBLOCKS)); + (int64_t)alen, 0); } if (error) { @@ -4701,13 +4662,12 @@ xfs_bmapi( if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) got.br_state = XFS_EXT_UNWRITTEN; } - error = xfs_bmap_add_extent(ip, lastx, &cur, &got, + error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, firstblock, flist, &tmp_logflags, - whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); + whichfork); logflags |= tmp_logflags; if (error) goto error0; - lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4803,13 +4763,12 @@ xfs_bmapi( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx, &cur, mval, + error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, firstblock, flist, &tmp_logflags, - whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); + whichfork); logflags |= tmp_logflags; if (error) goto error0; - lastx = ifp->if_lastex; ep = xfs_iext_get_ext(ifp, lastx); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(ep, &got); @@ -4868,14 +4827,14 @@ xfs_bmapi( /* * Else go on to the next record. */ - ep = xfs_iext_get_ext(ifp, ++lastx); prev = got; - if (lastx >= nextents) - eof = 1; - else + if (++lastx < nextents) { + ep = xfs_iext_get_ext(ifp, lastx); xfs_bmbt_get_all(ep, &got); + } else { + eof = 1; + } } - ifp->if_lastex = lastx; *nmap = n; /* * Transform from btree to extents, give it cur. @@ -4984,7 +4943,6 @@ xfs_bmapi_single( ASSERT(!isnullstartblock(got.br_startblock)); ASSERT(bno < got.br_startoff + got.br_blockcount); *fsb = got.br_startblock + (bno - got.br_startoff); - ifp->if_lastex = lastx; return 0; } @@ -5026,7 +4984,6 @@ xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ - int rsvd; /* OK to allocate reserved blocks */ xfs_fsblock_t sum; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5044,7 +5001,7 @@ xfs_bunmapi( mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + ASSERT(len > 0); ASSERT(nexts >= 0); ASSERT(ifp->if_ext_max == @@ -5160,9 +5117,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx, &cur, &del, + error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK, 0); + XFS_DATA_FORK); if (error) goto error0; goto nodelete; @@ -5188,9 +5145,12 @@ xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (bno < got.br_startoff) { - if (--lastx >= 0) - xfs_bmbt_get_all(--ep, &got); + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { @@ -5214,18 +5174,19 @@ xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx - 1, &cur, + lastx--; + error = xfs_bmap_add_extent(ip, &lastx, &cur, &prev, firstblock, flist, &logflags, - XFS_DATA_FORK, 0); + XFS_DATA_FORK); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx, &cur, + error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, firstblock, flist, &logflags, - XFS_DATA_FORK, 0); + XFS_DATA_FORK); if (error) goto error0; goto nodelete; @@ -5240,13 +5201,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, rsvd); + (int64_t)rtexts, 0); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, rsvd); + (int64_t)del.br_blockcount, 0); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5277,31 +5238,29 @@ xfs_bunmapi( error = XFS_ERROR(ENOSPC); goto error0; } - error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, - &tmp_logflags, whichfork, rsvd); + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) goto error0; bno = del.br_startoff - 1; nodelete: - lastx = ifp->if_lastex; /* * If not done go on to the next (previous) record. - * Reset ep in case the extents array was re-alloced. */ - ep = xfs_iext_get_ext(ifp, lastx); if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || - xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, lastx); - } - if (lastx >= 0) + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } xfs_bmbt_get_all(ep, &got); + } extno++; } } - ifp->if_lastex = lastx; *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 3651191daea..c62234bde05 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -69,7 +69,6 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ -#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ @@ -87,7 +86,6 @@ typedef struct xfs_bmap_free { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ - { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index be628677c28..9a84a85c03b 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -202,7 +202,7 @@ xfs_swap_extents( xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { - xfs_mount_t *mp; + xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; @@ -212,16 +212,12 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; - mp = ip->i_mount; - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); goto out; } - sbp = &sxp->sx_stat; - /* * we have to do two separate lock calls here to keep lockdep * happy. If we try to get all the locks in one call, lock will diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a37480a6e02..a098a20ca63 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -920,7 +920,6 @@ xfs_iread_extents( /* * We know that the size is valid (it's checked in iformat_btree) */ - ifp->if_lastex = NULLEXTNUM; ifp->if_bytes = ifp->if_real_bytes = 0; ifp->if_flags |= XFS_IFEXTENTS; xfs_iext_add(ifp, 0, nextents); @@ -1354,7 +1353,7 @@ xfs_itruncate_start( return 0; } last_byte = xfs_file_last_byte(ip); - trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); + trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); if (last_byte > toss_start) { if (flags & XFS_ITRUNC_DEFINITE) { xfs_tosspages(ip, toss_start, @@ -1470,7 +1469,7 @@ xfs_itruncate_finish( * file but the log buffers containing the free and reallocation * don't, then we'd end up with garbage in the blocks being freed. * As long as we make the new_size permanent before actually - * freeing any blocks it doesn't matter if they get writtten to. + * freeing any blocks it doesn't matter if they get written to. * * The callers must signal into us whether or not the size * setting here must be synchronous. There are a few cases @@ -2558,12 +2557,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_EXTENTS: ASSERT((ifp->if_flags & XFS_IFEXTENTS) || !(iip->ili_format.ilf_fields & extflag[whichfork])); - ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || - (ifp->if_bytes == 0)); - ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || - (ifp->if_bytes > 0)); if ((iip->ili_format.ilf_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); @@ -3112,6 +3108,8 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; } else if (ifp->if_flags & XFS_IFEXTIREC) { @@ -3191,7 +3189,6 @@ xfs_iext_add( } ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; ifp->if_real_bytes = 0; - ifp->if_lastex = nextents + ext_diff; } /* * Otherwise use a linear (direct) extent list. @@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec( xfs_extnum_t page_idx = *idxp; /* extent index in target list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0 && page_idx <= - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; low = 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ff4e2a30227..3ae6d58e547 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -67,7 +67,6 @@ typedef struct xfs_ifork { short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ unsigned char if_ext_max; /* max # of extent records */ - xfs_extnum_t if_lastex; /* last if_extents used */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 576fdfe81d6..09983a3344a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -970,7 +970,6 @@ xfs_iflush_abort( { xfs_inode_log_item_t *iip = ip->i_itemp; - iip = ip->i_itemp; if (iip) { struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b612ce4520a..211930246f2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log) xlog_cil_destroy(log); + /* + * always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. + */ + xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_free(log->l_xbuf); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); @@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log) } spinlock_destroy(&log->l_icloglock); - xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ @@ -3248,13 +3254,6 @@ xfs_log_ticket_get( return ticket; } -xlog_tid_t -xfs_log_get_trans_ident( - struct xfs_trans *tp) -{ - return tp->t_ticket->t_tid; -} - /* * Allocate and initialise a new log ticket. */ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3bd3291ef8d..78c9039994a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -189,8 +189,6 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); - void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9ca59be0897..c7755d5a5fb 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -29,6 +29,7 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_discard.h" /* * Perform initial CIL structure initialisation. If the CIL is not @@ -361,19 +362,28 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_busy_extent *busyp, *n; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) - xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + xfs_alloc_busy_sort(&ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); spin_lock(&ctx->cil->xc_cil_lock); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_cil_lock); xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + } + kmem_free(ctx); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 5864850e9e3..2d3b6a498d6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i) shutdown */ #define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ +typedef __uint32_t xlog_tid_t; + #ifdef __KERNEL__ /* * Below are states for covering allocation transactions. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5cc464a17c9..04142caedb2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -205,6 +205,35 @@ xlog_bread( } /* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + xlog_t *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + xfs_buf_t *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + int orig_len = bp->b_buffer_length; + int error, error2; + + error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. * This can only be used for synchronous log writes. @@ -1229,20 +1258,12 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp); - balign = BBTOB(ealign - start_block); - error = XFS_BUF_SET_PTR(bp, offset + balign, - BBTOB(sectbb)); + offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); if (error) break; - error = xlog_bread_noalign(log, ealign, sectbb, bp); - if (error) - break; - - error = XFS_BUF_SET_PTR(bp, offset, bufblks); - if (error) - break; } offset = xlog_align(log, start_block, endcount, bp); @@ -3448,19 +3469,9 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - error = XFS_BUF_SET_PTR(hbp, - offset + BBTOB(split_hblks), - BBTOB(hblks - split_hblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, 0, - wrapped_hblks, hbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(hbp, offset, - BBTOB(hblks)); + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); if (error) goto bread_err2; } @@ -3511,19 +3522,9 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - error = XFS_BUF_SET_PTR(dbp, - offset + BBTOB(split_bblks), - BBTOB(bblks - split_bblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, wrapped_hblks, - bblks - split_bblks, - dbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(dbp, offset, h_size); + error = xlog_bread_offset(log, 0, + bblks - split_bblks, hbp, + offset + BBTOB(split_bblks)); if (error) goto bread_err2; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb3f9a7b24e..b49b82363d2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch( uint nmsb, int rsvd) { - xfs_mod_sb_t *msbp = &msb[0]; + xfs_mod_sb_t *msbp; int error = 0; /* @@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch( * changes will be atomic. */ spin_lock(&mp->m_sb_lock); - for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { + for (msbp = msb; msbp < (msb + nmsb); msbp++) { ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || msbp->msb_field > XFS_SBS_FDBLOCKS); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 19af0ab0d0c..3d68bb267c5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -224,6 +224,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ +#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ #define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 76922793f64..7c7bc2b786b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -608,10 +608,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - struct xfs_busy_extent *busyp, *n; - - list_for_each_entry_safe(busyp, n, &tp->t_busy, list) - xfs_alloc_busy_clear(tp->t_mountp, busyp); + xfs_alloc_busy_sort(&tp->t_busy); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 26d1867d815..65584b55607 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint32_t xlog_tid_t; /* transaction ID type */ - /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |