Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/Makefile         |   2
-rw-r--r-- | fs/ext4/acl.c            |  11
-rw-r--r-- | fs/ext4/acl.h            |   2
-rw-r--r-- | fs/ext4/balloc.c         |   8
-rw-r--r-- | fs/ext4/block_validity.c |   7
-rw-r--r-- | fs/ext4/dir.c            |  58
-rw-r--r-- | fs/ext4/ext4.h           | 208
-rw-r--r-- | fs/ext4/ext4_extents.h   |  73
-rw-r--r-- | fs/ext4/ext4_jbd2.h      |   2
-rw-r--r-- | fs/ext4/extents.c        | 455
-rw-r--r-- | fs/ext4/file.c           |  68
-rw-r--r-- | fs/ext4/fsync.c          |  90
-rw-r--r-- | fs/ext4/ialloc.c         | 137
-rw-r--r-- | fs/ext4/inode.c          | 683
-rw-r--r-- | fs/ext4/ioctl.c          |  24
-rw-r--r-- | fs/ext4/mballoc.c        | 598
-rw-r--r-- | fs/ext4/migrate.c        |   4
-rw-r--r-- | fs/ext4/move_extent.c    |  22
-rw-r--r-- | fs/ext4/namei.c          | 134
-rw-r--r-- | fs/ext4/page-io.c        | 428
-rw-r--r-- | fs/ext4/resize.c         | 119
-rw-r--r-- | fs/ext4/super.c          | 971
-rw-r--r-- | fs/ext4/xattr.c          |  32
-rw-r--r-- | fs/ext4/xattr.h          |  10
24 files changed, 2775 insertions, 1371 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8867b2a1e5f..c947e36eda6 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 5e2ed4504ea..e0270d1f8d8 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_check_acl(struct inode *inode, int mask) +ext4_check_acl(struct inode *inode, int mask, unsigned int flags) { - struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + struct posix_acl *acl; + + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac..dec821168fd 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +extern int ext4_check_acl(struct inode *, int, unsigned int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd30799a43e..adf96b82278 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -171,7 +171,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * less than the blocksize * 8 ( which is the size * of bitmap ), set rest of the block bitmap to 1 */ - mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); + ext4_mark_bitmap_end(group_blocks, sb->s_blocksize * 8, + bh->b_data); } return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -489,7 +490,7 @@ error_return: * Check if filesystem has nblocks free & available for allocation. * On success return 1, return 0 on failure. */ -int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) +static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) { s64 free_blocks, dirty_blocks, root_blocks; struct percpu_counter *fbc = &sbi->s_freeblocks_counter; @@ -591,7 +592,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. 
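The ext4_check_acl() change in the acl.c hunk above wires ext4 into the VFS rcu-walk path: when IPERM_FLAG_RCU is set the filesystem may not sleep, so it can only answer from a lock-free cache and must otherwise return -ECHILD to make the VFS retry in ref-walk mode. A minimal userspace model of that contract follows; IPERM_FLAG_RCU and the -ECHILD/-EAGAIN convention come from the hunk itself, while the two-state cache and all demo_* names are invented for illustration.

#include <errno.h>
#include <stdio.h>

#define IPERM_FLAG_RCU	0x0001

enum acl_cache_state { ACL_UNCACHED, ACL_CACHED_NONE };

struct demo_inode { enum acl_cache_state acl_state; };

static int demo_check_acl(struct demo_inode *inode, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU) {
		/* rcu-walk: no sleeping, no I/O; unless the absence of
		 * an ACL is already cached, punt back to the VFS. */
		if (inode->acl_state != ACL_CACHED_NONE)
			return -ECHILD;
		return -EAGAIN;	/* no ACL: fall back to mode bits */
	}
	/* ref-walk mode: a blocking ACL lookup would go here */
	return -EAGAIN;
}

int main(void)
{
	struct demo_inode i = { ACL_UNCACHED };

	printf("rcu-walk, uncached: %d (expect %d)\n",
	       demo_check_acl(&i, IPERM_FLAG_RCU), -ECHILD);
	i.acl_state = ACL_CACHED_NONE;
	printf("rcu-walk, negative: %d (expect %d)\n",
	       demo_check_acl(&i, IPERM_FLAG_RCU), -EAGAIN);
	return 0;
}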
*/ - if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + if (!(*errp) && + ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3db5084db9b..fac90f3fba8 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -29,16 +29,15 @@ struct ext4_system_zone { static struct kmem_cache *ext4_system_zone_cachep; -int __init init_ext4_system_zone(void) +int __init ext4_init_system_zone(void) { - ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, - SLAB_RECLAIM_ACCOUNT); + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); if (ext4_system_zone_cachep == NULL) return -ENOMEM; return 0; } -void exit_ext4_system_zone(void) +void ext4_exit_system_zone(void) { kmem_cache_destroy(ext4_system_zone_cachep); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 374510f72ba..164c56092e5 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -39,7 +39,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp); const struct file_operations ext4_dir_operations = { - .llseek = generic_file_llseek, + .llseek = ext4_llseek, .read = generic_read_dir, .readdir = ext4_readdir, /* we take BKL. needed?*/ .unlocked_ioctl = ext4_ioctl, @@ -60,9 +60,13 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } - +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + */ int __ext4_check_dir_entry(const char *function, unsigned int line, - struct inode *dir, + struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, unsigned int offset) @@ -71,26 +75,37 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (rlen < EXT4_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else + return 0; - if (error_msg != NULL) - ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - " - "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), offset, - le32_to_cpu(de->inode), - rlen, de->name_len); - return error_msg == NULL ? 1 : 0; + if (filp) + ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh ? 
bh->b_blocknr : 0, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset%bh->b_size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; } static int ext4_readdir(struct file *filp, @@ -152,8 +167,9 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_INODE(inode, "directory " - "contains a hole at offset %Lu", + EXT4_ERROR_FILE(filp, 0, + "directory contains a " + "hole at offset %llu", (unsigned long long) filp->f_pos); dir_has_error = 1; } @@ -194,8 +210,8 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry(inode, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 889ec9d5e6a..0c8d97b56f3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -62,8 +62,8 @@ #define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) -#define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -168,7 +168,20 @@ struct mpage_da_data { int pages_written; int retval; }; -#define EXT4_IO_UNWRITTEN 0x1 + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_ERROR 0x0002 + +struct ext4_io_page { + struct page *p_page; + atomic_t p_count; +}; + +#define MAX_IO_PAGES 128 + typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ @@ -179,8 +192,18 @@ typedef struct ext4_io_end { struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ + int num_io_pages; + struct ext4_io_page *pages[MAX_IO_PAGES]; } ext4_io_end_t; +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + struct ext4_io_page *io_page; + sector_t io_next_block; +}; + /* * Special inodes numbers */ @@ -205,6 +228,7 @@ typedef struct ext4_io_end { #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else @@ -537,23 +561,7 @@ struct ext4_new_group_data { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif - -/* - * Mount options - */ -struct ext4_mount_options { - unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; - unsigned long s_commit_interval; - u32 s_min_batch_time, s_max_batch_time; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; -#endif -}; - -/* Max physical block we can addres w/o extents */ +/* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* @@ -685,6 +693,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ ext4_decode_extra_time(&(inode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ @@ -695,6 +705,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ 
ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -726,12 +738,13 @@ do { \ /* * storage for cached extent + * If ec_len == 0, then the cache is invalid. + * If ec_start == 0, then the cache represents a gap (null mapping) */ struct ext4_ext_cache { ext4_fsblk_t ec_start; ext4_lblk_t ec_block; __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; }; /* @@ -750,10 +763,12 @@ struct ext4_inode_info { * near to their parent directory's inode. */ ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ +#endif unsigned long i_flags; - ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file @@ -796,7 +811,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; - struct jbd2_inode jinode; + struct jbd2_inode *jinode; struct ext4_ext_cache i_cached_extent; /* @@ -816,14 +831,12 @@ struct ext4_inode_info { unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - sector_t i_da_metadata_calc_last_lblock; + ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; - spinlock_t i_block_reservation_lock; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -832,9 +845,12 @@ struct ext4_inode_info { /* completed IOs that might need unwritten extents handling */ struct list_head i_completed_io_list; spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + spinlock_t i_block_reservation_lock; + /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. 
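The comment added to struct ext4_ext_cache a hunk above encodes the cache state in the fields themselves rather than in the removed ec_type: ec_len == 0 means nothing is cached, and ec_start == 0 with a nonzero length marks a cached hole (null mapping). A small userspace model of that encoding, with invented struct and helper names:

#include <stdint.h>
#include <stdio.h>

struct demo_ext_cache {
	uint64_t ec_start;	/* physical start; 0 => gap (hole) */
	uint32_t ec_block;	/* logical start */
	uint32_t ec_len;	/* 0 => cache invalid */
};

/* Return 0 on a miss, 1 on a hit; *is_gap reports hole vs. mapped. */
static int demo_in_cache(const struct demo_ext_cache *c, uint32_t block,
			 int *is_gap)
{
	if (c->ec_len == 0)
		return 0;	/* invalid slot */
	if (block < c->ec_block || block - c->ec_block >= c->ec_len)
		return 0;	/* outside the cached range */
	*is_gap = (c->ec_start == 0);
	return 1;
}

int main(void)
{
	struct demo_ext_cache c = { 0, 100, 8 };	/* hole at 100..107 */
	int gap = 0;

	printf("hit=%d gap=%d\n", demo_in_cache(&c, 103, &gap), gap);
	c.ec_len = 0;		/* invalidate, as ext4_ext_invalidate_cache()
				 * now does by zeroing ec_len */
	printf("hit=%d\n", demo_in_cache(&c, 103, &gap));
	return 0;
}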
@@ -885,16 +901,27 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ -#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt -#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_clear_bit ext2_clear_bit @@ -1060,6 +1087,7 @@ struct ext4_sb_info { struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; unsigned int s_mount_opt; + unsigned int s_mount_opt2; unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; @@ -1087,7 +1115,6 @@ struct ext4_sb_info { struct completion s_kobj_unregister; /* Journaling */ - struct inode *s_journal_inode; struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; @@ -1120,10 +1147,7 @@ struct ext4_sb_info { /* for buddy allocator */ struct ext4_group_info ***s_group_info; struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; spinlock_t s_md_lock; - tid_t s_last_transaction; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; @@ -1141,7 +1165,6 @@ struct ext4_sb_info { unsigned long s_mb_last_start; /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ @@ -1172,6 +1195,11 @@ struct ext4_sb_info { /* timer for periodic error stats printing */ struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1210,24 +1238,39 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ }; -#define EXT4_INODE_BIT_FNS(name, field) \ +#define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ - return test_bit(bit, &EXT4_I(inode)->i_##field); \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ - set_bit(bit, &EXT4_I(inode)->i_##field); \ + set_bit(bit + (offset), 
&EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ - clear_bit(bit, &EXT4_I(inode)->i_##field); \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } -EXT4_INODE_BIT_FNS(flag, flags) -EXT4_INODE_BIT_FNS(state, state_flags) +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1533,7 +1576,42 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); -extern struct proc_dir_entry *ext4_proc_root; +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + + wait_queue_head_t li_wait_daemon; + wait_queue_head_t li_wait_task; + struct timer_list li_timer; + struct task_struct *li_task; + + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; /* * Function prototypes @@ -1561,7 +1639,6 @@ extern unsigned long ext4_bg_num_gdb(struct super_block *sb, extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); @@ -1581,10 +1658,12 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, de, bh, offset) \ - __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1592,6 +1671,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ extern int ext4_sync_file(struct file *, int); +extern int ext4_flush_completed_IO(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1605,11 +1685,9 @@ extern struct inode * ext4_orphan_get(struct super_block *, 
unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); /* mballoc.c */ extern long ext4_mb_stats; @@ -1620,16 +1698,15 @@ extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); -extern int __init init_ext4_mballoc(void); -extern void exit_ext4_mballoc(void); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); -extern void ext4_mb_put_buddy_cache_lock(struct super_block *, - ext4_group_t, int); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + /* inode.c */ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); @@ -1657,13 +1734,11 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); /* ioctl.c */ @@ -1696,8 +1771,8 @@ extern void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...) __attribute__ ((format (printf, 5, 6))); extern void ext4_error_file(struct file *, const char *, unsigned int, - const char *, ...) - __attribute__ ((format (printf, 4, 5))); + ext4_fsblk_t, const char *, ...) 
+ __attribute__ ((format (printf, 5, 6))); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern void __ext4_abort(struct super_block *, const char *, unsigned int, @@ -1960,6 +2035,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; @@ -1973,8 +2049,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); -extern int __init init_ext4_system_zone(void); -extern void exit_ext4_system_zone(void); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); @@ -1989,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); @@ -2002,6 +2078,18 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bdb6ce7e2eb..2e29abb30f7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -119,10 +119,6 @@ struct ext4_ext_path { * structure for external API */ -#define EXT4_EXT_CACHE_NO 0 -#define EXT4_EXT_CACHE_GAP 1 -#define EXT4_EXT_CACHE_EXTENT 2 - /* * to be called by ext4_ext_walk_space() * negative retcode - error @@ -197,7 +193,7 @@ static inline unsigned short ext_depth(struct inode *inode) static inline void ext4_ext_invalidate_cache(struct inode *inode) { - EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; + EXT4_I(inode)->i_cached_extent.ec_len = 0; } static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) @@ -225,11 +221,60 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * 
ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + extern int ext4_ext_calc_metadata_amount(struct inode *inode, - sector_t lblocks); -extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); -extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); -extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); + ext4_lblk_t lblocks); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, @@ -237,19 +282,9 @@ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, extern int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2); -extern int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *); -extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, - ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); -extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, - ext4_lblk_t *, ext4_fsblk_t *); -extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, - ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b0bd792c58c..d8b992e658c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -253,7 +253,7 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 06328d3e571..63a75810b7c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,55 +44,6 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" - -/* - * ext_pblock: - * combine low and high parts of physical block number into ext4_fsblk_t - */ -ext4_fsblk_t ext_pblock(struct ext4_extent *ex) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ex->ee_start_lo); - block |= ((ext4_fsblk_t) 
le16_to_cpu(ex->ee_start_hi) << 31) << 1; - return block; -} - -/* - * idx_pblock: - * combine low and high parts of a leaf physical block number into ext4_fsblk_t - */ -ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ix->ei_leaf_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; - return block; -} - -/* - * ext4_ext_store_pblock: - * stores a large physical block number into an extent struct, - * breaking it into parts - */ -void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) -{ - ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); -} - -/* - * ext4_idx_store_pblock: - * stores a large physical block number into an index struct, - * breaking it into parts - */ -static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) -{ - ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); -} - static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -166,10 +117,33 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_extent *ex; depth = path->p_depth; - /* try to predict block placement */ + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * the eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some hueristics or some way to allow + * userspace to pass a hint to file system, + * especiially if the latter case turns out to be + * common. 
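The ext4_ext_pblock()/ext4_ext_store_pblock() helpers, moved from extents.c into ext4_extents.h as inlines earlier in this diff, split a 48-bit physical block number across a 32-bit low word (ee_start_lo) and a 16-bit high word (ee_start_hi). The round-trip below is a userspace sketch of that encoding; it uses a plain 64-bit shift by 32, which is equivalent to the kernel's (x << 31) << 1 / (x >> 31) >> 1 spelling for these values.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_extent {
	uint32_t ee_start_lo;	/* low 32 bits of physical block */
	uint16_t ee_start_hi;	/* high 16 bits of physical block */
};

static void demo_store_pblock(struct demo_extent *ex, uint64_t pb)
{
	ex->ee_start_lo = (uint32_t)(pb & 0xffffffff);
	ex->ee_start_hi = (uint16_t)((pb >> 32) & 0xffff);
}

static uint64_t demo_pblock(const struct demo_extent *ex)
{
	return (uint64_t)ex->ee_start_lo |
	       ((uint64_t)ex->ee_start_hi << 32);
}

int main(void)
{
	struct demo_extent ex;
	uint64_t pb = 0x123456789abcULL;	/* a 48-bit block number */

	demo_store_pblock(&ex, pb);
	assert(demo_pblock(&ex) == pb);
	printf("lo=%#x hi=%#x -> %#llx\n", ex.ee_start_lo, ex.ee_start_hi,
	       (unsigned long long)demo_pblock(&ex));
	return 0;
}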
+ */ ex = path[depth].p_ext; - if (ex) - return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } /* it looks like index is empty; * try to find starting block from index itself */ @@ -292,7 +266,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); int idxs, num = 0; @@ -354,7 +328,7 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext); + ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); @@ -363,7 +337,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx); + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } @@ -463,13 +437,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext), - ext_pblock(path->p_ext)); + ext4_ext_pblock(path->p_ext)); } else ext_debug(" []"); } @@ -494,7 +468,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); } @@ -545,7 +519,7 @@ ext4_ext_binsearch_idx(struct inode *inode, path->p_idx = l - 1; ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { @@ -614,7 +588,7 @@ ext4_ext_binsearch(struct inode *inode, path->p_ext = l - 1; ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), - ext_pblock(path->p_ext), + ext4_ext_pblock(path->p_ext), ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); @@ -682,7 +656,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); - path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; @@ -721,7 +695,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) - path[ppos].p_block = ext_pblock(path[ppos].p_ext); + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, 
path); @@ -739,9 +713,9 @@ err: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ -int ext4_ext_insert_index(handle_t *handle, struct inode *inode, - struct ext4_ext_path *curp, - int logical, ext4_fsblk_t ptr) +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; @@ -917,7 +891,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), - ext_pblock(path[depth].p_ext), + ext4_ext_pblock(path[depth].p_ext), ext4_ext_is_uninitialized(path[depth].p_ext), ext4_ext_get_actual_len(path[depth].p_ext), newblock); @@ -1007,7 +981,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), - idx_pblock(path[i].p_idx), + ext4_idx_pblock(path[i].p_idx), newblock); /*memmove(++fidx, path[i].p_idx++, sizeof(struct ext4_extent_idx)); @@ -1146,7 +1120,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), - idx_pblock(EXT_FIRST_INDEX(neh))); + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); neh->eh_depth = cpu_to_le16(path->p_depth + 1); err = ext4_ext_dirty(handle, inode, curp); @@ -1232,9 +1206,9 @@ out: * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; @@ -1286,7 +1260,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; - *phys = ext_pblock(ex) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } @@ -1297,9 +1271,9 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1342,7 +1316,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, } } *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); return 0; } @@ -1357,7 +1331,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, /* next allocated block in this leaf */ ex++; *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); return 0; } @@ -1376,7 +1350,7 @@ got_index: * follow it and find the closest allocated * block to the right */ ix++; - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { bh = sb_bread(inode->i_sb, block); if (bh == NULL) @@ -1388,7 +1362,7 @@ got_index: return -EIO; } ix = EXT_FIRST_INDEX(eh); - block = 
idx_pblock(ix); + block = ext4_idx_pblock(ix); put_bh(bh); } @@ -1402,7 +1376,7 @@ got_index: } ex = EXT_FIRST_EXTENT(eh); *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); + *phys = ext4_ext_pblock(ex); put_bh(bh); return 0; } @@ -1573,7 +1547,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, return 0; #endif - if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } @@ -1585,9 +1559,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; @@ -1632,9 +1606,9 @@ int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -unsigned int ext4_ext_check_overlap(struct inode *inode, - struct ext4_extent *newext, - struct ext4_ext_path *path) +static unsigned int ext4_ext_check_overlap(struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; @@ -1706,11 +1680,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; @@ -1780,7 +1755,7 @@ has_space: /* there is no extent in this leaf, create first one */ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); @@ -1794,7 +1769,7 @@ has_space: ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); @@ -1808,7 +1783,7 @@ has_space: ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); @@ -1819,7 +1794,7 @@ has_space: le16_add_cpu(&eh->eh_entries, 1); nearex = path[depth].p_ext; nearex->ee_block = newext->ee_block; - ext4_ext_store_pblock(nearex, ext_pblock(newext)); + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: @@ -1845,9 +1820,9 @@ cleanup: return err; } -int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int 
ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) { struct ext4_ext_path *path = NULL; struct ext4_ext_cache cbex; @@ -1919,12 +1894,10 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_block = start; cbex.ec_len = end - start; cbex.ec_start = 0; - cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext_pblock(ex); - cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + cbex.ec_start = ext4_ext_pblock(ex); } if (unlikely(cbex.ec_len == 0)) { @@ -1964,13 +1937,12 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start, int type) + __u32 len, ext4_fsblk_t start) { struct ext4_ext_cache *cex; BUG_ON(len == 0); spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; @@ -2023,15 +1995,18 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); + ext4_ext_put_in_cache(inode, lblock, len, 0); } +/* + * Return 0 if cache is invalid; 1 if the cache is valid + */ static int ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - int ret = EXT4_EXT_CACHE_NO; + int ret = 0; /* * We borrow i_block_reservation_lock to protect i_cached_extent @@ -2040,11 +2015,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? 
*/ - if (cex->ec_type == EXT4_EXT_CACHE_NO) + if (cex->ec_len == 0) goto errout; - BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && - cex->ec_type != EXT4_EXT_CACHE_EXTENT); if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); @@ -2052,7 +2025,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - ret = cex->ec_type; + ret = 1; } errout: spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2073,7 +2046,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, /* free index block */ path--; - leaf = idx_pblock(path->p_idx); + leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EIO; @@ -2181,7 +2154,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t start; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext_pblock(ex) + ee_len - num; + start = ext4_ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) @@ -2310,7 +2283,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, goto out; ext_debug("new extent: %u:%u:%llu\n", block, num, - ext_pblock(ex)); + ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2421,9 +2394,9 @@ again: struct buffer_head *bh; /* go to the next level */ ext_debug("move to level %d (block %llu)\n", - i + 1, idx_pblock(path[i].p_idx)); + i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, idx_pblock(path[i].p_idx)); + bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); if (!bh) { /* should we reset i_size? */ err = -EIO; @@ -2535,77 +2508,21 @@ void ext4_ext_release(struct super_block *sb) #endif } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { + ext4_fsblk_t ee_pblock; + unsigned int ee_len; int ret; - struct bio *bio; - int blkbits, blocksize; - sector_t ee_pblock; - struct completion event; - unsigned int ee_len, len, done, offset; - - blkbits = inode->i_blkbits; - blocksize = inode->i_sb->s_blocksize; ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext_pblock(ex); - - /* convert ee_pblock to 512 byte sectors */ - ee_pblock = ee_pblock << (blkbits - 9); - - while (ee_len > 0) { - - if (ee_len > BIO_MAX_PAGES) - len = BIO_MAX_PAGES; - else - len = ee_len; - - bio = bio_alloc(GFP_NOIO, len); - if (!bio) - return -ENOMEM; - - bio->bi_sector = ee_pblock; - bio->bi_bdev = inode->i_sb->s_bdev; - - done = 0; - offset = 0; - while (done < len) { - ret = bio_add_page(bio, ZERO_PAGE(0), - blocksize, offset); - if (ret != blocksize) { - /* - * We can't add any more pages because of - * hardware limitations. Start a new bio. 
- */ - break; - } - done++; - offset += blocksize; - if (offset >= PAGE_CACHE_SIZE) - offset = 0; - } + ee_pblock = ext4_ext_pblock(ex); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(WRITE, bio); - wait_for_completion(&event); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { - bio_put(bio); - return -EIO; - } - bio_put(bio); - ee_len -= done; - ee_pblock += done << (blkbits - 9); - } - return 0; + return ret; } #define EXT4_EXT_ZERO_LEN 7 @@ -2651,12 +2568,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext_pblock(ex); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); /* * It is safe to convert extent to initialized via explicit @@ -2675,7 +2592,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ return allocated; @@ -2710,7 +2627,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = orig_ex.ee_block; ex->ee_len = cpu_to_le16(ee_len - allocated); ext4_ext_mark_uninitialized(ex); - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); ex3 = &newex; @@ -2725,7 +2642,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto fix_extent_len; ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, + ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* blocks available from map->m_lblk */ return allocated; @@ -2782,7 +2700,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ /* blocks available from map->m_lblk */ @@ -2833,7 +2751,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ /* blocks available from map->m_lblk */ @@ -2902,7 +2820,7 @@ insert: /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ return allocated; @@ -2915,7 +2833,7 @@ out: fix_extent_len: ex->ee_block = 
orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_mark_uninitialized(ex); ext4_ext_dirty(handle, inode, path + depth); return err; @@ -2927,14 +2845,14 @@ fix_extent_len: * to an uninitialized extent. * * Writing to an uninitized extent may result in splitting the uninitialized - * extent into multiple /intialized unintialized extents (up to three) + * extent into multiple /initialized uninitialized extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be uninitialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * One of more index blocks maybe needed if the extent tree grow after - * the unintialized extent split. To prevent ENOSPC occur at the IO + * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit * the IO. The uninitialized extent called at this time will be split * into three uninitialized extent(at most). After IO complete, the part @@ -2973,12 +2891,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); - newblock = map->m_lblk - ee_block + ext_pblock(ex); + newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); - ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); /* * It is safe to convert extent to initialized via explicit @@ -3027,7 +2945,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ /* blocks available from map->m_lblk */ @@ -3099,7 +3017,7 @@ insert: /* update the extent length and mark as initialized */ ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ return allocated; @@ -3112,7 +3030,7 @@ out: fix_extent_len: ex->ee_block = orig_ex.ee_block; ex->ee_len = orig_ex.ee_len; - ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); ext4_ext_mark_uninitialized(ex); ext4_ext_dirty(handle, inode, path + depth); return err; @@ -3180,6 +3098,57 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, unmap_underlying_metadata(bdev, block + i); } +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *ex, *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); + 
return -EIO; + } + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. + */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3206,7 +3175,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * completed */ if (io) - io->flag = EXT4_IO_UNWRITTEN; + io->flag = EXT4_IO_END_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) @@ -3217,8 +3186,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if ((flags & EXT4_GET_BLOCKS_CONVERT)) { ret = ext4_convert_unwritten_extents_endio(handle, inode, path); - if (ret >= 0) + if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; goto out2; } /* buffered IO case */ @@ -3244,8 +3217,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) + if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } + out: if (ret <= 0) { err = ret; @@ -3292,6 +3271,7 @@ out2: } return err ? 
err : allocated; } + /* * Block allocation/map/preallocation routine for extents based files * @@ -3315,9 +3295,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex, *last_ex; + struct ext4_extent newex, *ex; ext4_fsblk_t newblock; - int i, err = 0, depth, ret, cache_type; + int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; @@ -3326,9 +3306,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); - if (cache_type) { - if (cache_type == EXT4_EXT_CACHE_GAP) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and @@ -3337,17 +3316,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { + } else { /* block is already allocated */ newblock = map->m_lblk - le32_to_cpu(newex.ee_block) - + ext_pblock(&newex); + + ext4_ext_pblock(&newex); /* number of remaining blocks in the extent */ allocated = ext4_ext_get_actual_len(&newex) - (map->m_lblk - le32_to_cpu(newex.ee_block)); goto out; - } else { - BUG(); } } @@ -3379,7 +3356,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); - ext4_fsblk_t ee_start = ext_pblock(ex); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* @@ -3398,8 +3375,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start, - EXT4_EXT_CACHE_EXTENT); + ee_len, ee_start); goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, @@ -3488,7 +3464,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { if (io) - io->flag = EXT4_IO_UNWRITTEN; + io->flag = EXT4_IO_END_UNWRITTEN; else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); @@ -3497,44 +3473,23 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_flags |= EXT4_MAP_UNINIT; } - if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, - "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - err = -EIO; - goto out2; - } - last_ex = EXT_LAST_EXTENT(eh); - /* - * If the current leaf block was reached by looking at - * the last index block all the way down the tree, and - * we are extending the inode beyond the last extent - * in the current leaf block, then clear the - * EOFBLOCKS_FL flag. 
- */ - for (i = depth-1; i >= 0; i--) { - if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) - break; - } - if ((i < 0) && - (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + - ext4_ext_get_actual_len(last_ex))) - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); + if (err) + goto out2; + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; } /* previous routine could use block we allocated */ - newblock = ext_pblock(&newex); + newblock = ext4_ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); if (allocated > map->m_len) allocated = map->m_len; @@ -3552,8 +3507,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. */ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -3581,6 +3535,12 @@ void ext4_ext_truncate(struct inode *inode) int err = 0; /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* * probably first extent we're gonna free will be last in block */ err = ext4_writepage_trans_blocks(inode); @@ -3667,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). 
*/ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3684,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* * currently supporting (pre)allocate mode for extent-based * files _only_ @@ -3691,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3729,7 +3690,7 @@ retry: printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, - inode->i_ino, block, max_blocks); + inode->i_ino, map.m_lblk, max_blocks); #endif ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -3829,7 +3790,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, logical = (__u64)newex->ec_block << blksize_bits; - if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + if (newex->ec_start == 0) { pgoff_t offset; struct page *page; struct buffer_head *bh = NULL; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ee92b66d455..2e8322c8aa8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -104,6 +104,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -127,11 +128,74 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_mark_super_dirty(sb); } } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } return dquot_file_open(inode, filp); } +/* + * ext4_llseek() copied from generic_file_llseek() to handle both + * block-mapped and extent-mapped maxbytes values. This should + * otherwise be identical with generic_file_llseek(). 
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + mutex_unlock(&inode->i_mutex); + return file->f_pos; + } + offset += file->f_pos; + break; + } + + if (offset < 0 || offset > maxbytes) { + mutex_unlock(&inode->i_mutex); + return -EINVAL; + } + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + + return offset; +} + const struct file_operations ext4_file_operations = { - .llseek = generic_file_llseek, + .llseek = ext4_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, @@ -146,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -159,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 592adf2e546..7829b287822 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -34,6 +34,89 @@ #include <trace/events/ext4.h> +static void dump_completed_IO(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); +#endif +} + +/* + * This function is called from ext4_sync_file(). + * + * When IO is completed, the work to convert unwritten extents to + * written is queued on a workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of pending/completed IO that + * might need the conversion. This function walks through + * the list and converts the related unwritten extents for completed IO + * to written. + * The function returns the number of pending IOs on success. + */ +extern int ext4_flush_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret = 0; + int ret2 = 0; + + if (list_empty(&ei->i_completed_io_list)) + return ret; + + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_io_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * be about to flush the work corresponding to this io structure. + * It will be upset if it finds that the io structure related + * to the work to be scheduled has been freed. + * + * Thus we need to keep the io structure still valid here after + * the conversion has finished. The io structure has a flag to + * avoid double conversion from both fsync and the background + * workqueue work. + */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + return (ret2 < 0) ? ret2 : 0; +} + /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since @@ -86,7 +169,7 @@ int ext4_sync_file(struct file *file, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_completed_IO(inode); + ret = ext4_flush_completed_IO(inode); if (ret < 0) return ret; @@ -128,10 +211,9 @@ int ext4_sync_file(struct file *file, int datasync) (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, - NULL, BLKDEV_IFL_WAIT); + NULL); ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 45853e0d1f2..eb9097aec6f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -50,7 +50,7 @@ * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users.
*/ -void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; @@ -65,9 +65,10 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) } /* Initializes an uninitialized inode bitmap */ -unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -85,7 +86,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); @@ -107,6 +108,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -123,6 +125,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); @@ -133,6 +136,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -411,8 +415,8 @@ struct orlov_stats { * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ -void get_orlov_stats(struct super_block *sb, ext4_group_t g, - int flex_size, struct orlov_stats *stats) +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; @@ -712,8 +716,17 @@ static int ext4_claim_inode(struct super_block *sb, { int free = 0, retval = 0, count; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + /* + * We have to be sure that new inode allocation does not race with + * inode table initialization, because otherwise we may end up + * allocating and writing new inode right before sb_issue_zeroout + * takes place and overwriting our new inode with zeroes. So we + * take alloc_sem to prevent it. 
+ */ + down_read(&grp->alloc_sem); ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ @@ -724,6 +737,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); @@ -772,6 +786,7 @@ static int ext4_claim_inode(struct super_block *sb, gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: ext4_unlock_group(sb, group); + up_read(&grp->alloc_sem); return retval; } @@ -1012,7 +1027,7 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; @@ -1205,3 +1220,109 @@ unsigned long ext4_count_dirs(struct super_block * sb) } return count; } + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_claim_inode until we are finished. + */ +extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. + */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u\n" + "Used itable blocks: %d" + "itable unused count: %u\n", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. 
+ */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b8debeb396..9f7f9e49914 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,7 +39,9 @@ #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> +#include <linux/printk.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -53,13 +55,27 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate( - EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode, - new_size); + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); /* * Test whether an inode is a fast symlink. @@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, } /** - * ext4_blks_to_allocate: Look up the block map and count the number + * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. 
* * @branch: chain of indirect blocks @@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, /** * ext4_alloc_blocks: allocate multiple blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocating at + * @goal: preferred physical block of allocation * @indirect_blks: the number of blocks that need to be allocated for indirect * blocks - * + * @blks: number of desired blocks * @new_blocks: on return it will store the new block numbers for * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, @@ -703,9 +726,11 @@ failed_out: /** * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks + * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * @@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, * parent to disk. */ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); @@ -813,6 +843,7 @@ failed: /** * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding * @chain: chain of indirect blocks (with a missing link - see @@ -1068,7 +1099,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); @@ -1207,8 +1238,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, break; idx++; num++; - if (num >= max_pages) + if (num >= max_pages) { + done = 1; break; + } } pagevec_release(&pvec); } @@ -1305,7 +1338,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * avoid double accounting */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 1; + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1335,7 +1368,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_da_update_reserve_space(inode, retval, 1); } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - EXT4_I(inode)->i_delalloc_reserved_flag = 0; + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -1538,10 +1571,10 @@ static int do_journal_get_write_access(handle_t *handle, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_prepare_write() could have dirtied some buffers. Clean + * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear + * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -1863,7 +1896,7 @@ static int ext4_journalled_write_end(struct file *file, /* * Reserve a single block located at lblock */ -static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -1995,16 +2028,23 @@ static void ext4_da_page_release_reservation(struct page *page, * * As pages are already locked by write_cache_pages(), we can't use it */ -static int mpage_da_submit_io(struct mpage_da_data *mpd) +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) { - long pages_skipped; struct pagevec pvec; unsigned long index, end; int ret = 0, err, nr_pages, i; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. 
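The rewritten submit path in the next hunk sizes each page's write by checking whether the page is the file's final, partially valid page. A standalone userspace sketch of that length computation (not kernel code; the 4 KiB page size and the sample i_size are illustrative assumptions):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long long size = 10000;	/* sample i_size: two full pages + 1808 bytes */
	unsigned long long index;
	unsigned long len;

	for (index = 0; index <= size >> PAGE_CACHE_SHIFT; index++) {
		if (index == size >> PAGE_CACHE_SHIFT)
			len = size & ~PAGE_CACHE_MASK;	/* bytes valid in the last page */
		else
			len = PAGE_CACHE_SIZE;		/* interior pages are fully valid */
		printf("page %llu: write %lu bytes\n", index, len);
	}
	return 0;
}

For i_size = 10000 this prints 4096 bytes for pages 0 and 1 and 1808 bytes for page 2, mirroring the len computation the hunk below performs before mapping buffers and submitting the I/O.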
@@ -2020,122 +2060,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { + int commit_write = 0, redirty_page = 0; struct page *page = pvec.pages[i]; index = page->index; if (index > end) break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } index++; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - pages_skipped = mpd->wbc->pages_skipped; - err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err && (pages_skipped == mpd->wbc->pages_skipped)) - /* - * have successfully written the page - * without skipping the same - */ - mpd->pages_written++; /* - * In error case, we have to continue because - * remaining pages are still locked - * XXX: unlock and re-dirty them? + * If the page does not have buffers (for + * whatever reason), try to create them using + * __block_write_begin. If this fails, + * redirty the page and move on. */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - return ret; -} - -/* - * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers - * - * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten - */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - int blocks = map->m_len; - sector_t pblock = map->m_pblk, cur_logical; - struct buffer_head *head, *bh; - pgoff_t index, end; - struct pagevec pvec; - int nr_pages, i; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - - while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(!page_has_buffers(page)); - - bh = page_buffers(page); - head = bh; - - /* skip blocks out of the range */ - do { - if (cur_logical >= map->m_lblk) - break; - cur_logical++; - } while ((bh = bh->b_this_page) != head); + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: + redirty_page_for_writepage(mpd->wbc, + page); + unlock_page(page); + continue; + } + commit_write = 1; + } + bh = page_bufs = page_buffers(page); + block_start = 0; do { - if (cur_logical >= map->m_lblk + blocks) - break; - - if (buffer_delay(bh) || buffer_unwritten(bh)) { - - BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); - + if (!bh) + goto redirty_page; + if (map && (cur_logical >= map->m_lblk) && + (cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock; - } else { - /* - * unwritten already should have - * blocknr assigned. 
Verify that - */ - clear_buffer_unwritten(bh); - BUG_ON(bh->b_blocknr != pblock); } + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } - } else if (buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); + /* redirty page if block allocation undone */ + if (buffer_delay(bh) || buffer_unwritten(bh)) + redirty_page = 1; + bh = bh->b_this_page; + block_start += bh->b_size; cur_logical++; pblock++; - } while ((bh = bh->b_this_page) != head); + } while (bh != page_bufs); + + if (redirty_page) + goto redirty_page; + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + /* + * Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) + err = __ext4_journalled_writepage(page, len); + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); + else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); + + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + */ + if (ret == 0) + ret = err; } pagevec_release(&pvec); } + ext4_io_submit(&io_submit); + return ret; } - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, sector_t logical, long blk_cnt) { @@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * mpage_da_map_blocks - go through given space + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O * * @mpd - bh describing space * * The function skips space we know is already mapped to disk blocks. * */ -static int mpage_da_map_blocks(struct mpage_da_data *mpd) +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; - struct ext4_map_blocks map; + struct ext4_map_blocks map, *mapp = NULL; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; loff_t disksize = EXT4_I(mpd->inode)->i_disksize; handle_t *handle = NULL; /* - * We consider only non-mapped and non-allocated blocks + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. */ - if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten))) - return 0; - - /* - * If we didn't accumulate anything to write simply return - */ - if (!mpd->b_size) - return 0; + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; handle = ext4_journal_current_handle(); BUG_ON(!handle); @@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't * want to change *many* call functions, so ext4_map_blocks() - * will set the magic i_delalloc_reserved_flag once the + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the * inode's allocation semaphore is taken. 
* * If the blocks in questions were delalloc blocks, set @@ -2252,17 +2278,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) err = blks; /* - * If get block returns with error we simply - * return. Later writepage will redirty the page and - * writepages will find the dirty page again + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will call + * ext4_writepage() for all of the pages which will + * just redirty the pages. */ if (err == -EAGAIN) - return 0; + goto submit_io; if (err == -ENOSPC && ext4_count_free_blocks(sb)) { mpd->retval = err; - return 0; + goto submit_io; } /* @@ -2287,10 +2314,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); - return err; + return; } BUG_ON(blks == 0); + mapp = ↦ if (map.m_flags & EXT4_MAP_NEW) { struct block_device *bdev = mpd->inode->i_sb->s_bdev; int i; @@ -2299,18 +2327,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) unmap_underlying_metadata(bdev, map.m_pblk + i); } - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if ((mpd->b_state & (1 << BH_Delay)) || - (mpd->b_state & (1 << BH_Unwritten))) - mpage_put_bnr_to_bhs(mpd, &map); - if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); if (err) - return err; + /* This only happens if the journal is aborted */ + return; } /* @@ -2321,10 +2342,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) disksize = i_size_read(mpd->inode); if (disksize > EXT4_I(mpd->inode)->i_disksize) { ext4_update_i_disksize(mpd->inode, disksize); - return ext4_mark_inode_dirty(handle, mpd->inode); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); } - return 0; +submit_io: + mpage_da_submit_io(mpd, mapp); + mpd->io_done = 1; } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ @@ -2401,9 +2428,7 @@ flush_it: * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); - mpd->io_done = 1; + mpage_da_map_and_submit(mpd); return; } @@ -2422,9 +2447,9 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) * The function finds extents of pages and scan them for all blocks. */ static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, + struct mpage_da_data *mpd) { - struct mpage_da_data *mpd = data; struct inode *inode = mpd->inode; struct buffer_head *bh, *head; sector_t logical; @@ -2435,15 +2460,13 @@ static int __mpage_da_writepage(struct page *page, if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using writepage() + * and start IO on them */ if (mpd->next_page != mpd->first_page) { - if (mpage_da_map_blocks(mpd) == 0) - mpage_da_submit_io(mpd); + mpage_da_map_and_submit(mpd); /* * skip rest of the page in the page_vec */ - mpd->io_done = 1; redirty_page_for_writepage(wbc, page); unlock_page(page); return MPAGE_DA_EXTENT_TAIL; @@ -2550,8 +2573,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, if (buffer_delay(bh)) return 0; /* Not sure this could or should happen */ /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? 
+ * XXX: __block_write_begin() unmaps passed block, is it OK? */ ret = ext4_da_reserve_space(inode, iblock); if (ret) @@ -2583,7 +2605,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write() and block_write_full_page(). + * callback function for block_write_begin() and block_write_full_page(). * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller @@ -2623,6 +2645,7 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; + ClearPageChecked(page); page_bufs = page_buffers(page); BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); @@ -2700,7 +2723,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int ext4_writepage(struct page *page, struct writeback_control *wbc) { - int ret = 0; + int ret = 0, commit_write = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; @@ -2713,71 +2736,44 @@ static int ext4_writepage(struct page *page, else len = PAGE_CACHE_SIZE; - if (page_has_buffers(page)) { - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation - * So redirty the page and return - * We may reach here when we do a journal commit - * via journal_submit_inode_data_buffers. - * If we don't have mapping block we just ignore - * them. We can also reach here via shrink_page_list - */ + /* + * If the page does not have buffers (for whatever reason), + * try to create them using __block_write_begin. If this + * fails, redirty the page and move on. + */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + redirty_page: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } - } else { + commit_write = 1; + } + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* - * The test for page_has_buffers() is subtle: - * We know the page is dirty but it lost buffers. That means - * that at some moment in time after write_begin()/write_end() - * has been called all buffers have been clean and thus they - * must have been written at least once. So they are all - * mapped and we can happily proceed with mapping them - * and writing the page. - * - * Try to initialize the buffer_heads and check whether - * all are mapped and non delay. We don't want to - * do block allocation here. + * We don't want to do block allocation, so redirty + * the page and return. We may reach here when we do + * a journal commit via journal_submit_inode_data_buffers. 
+ * We can also reach here via shrink_page_list */ - ret = block_prepare_write(page, 0, len, - noalloc_get_block_write); - if (!ret) { - page_bufs = page_buffers(page); - /* check whether all are mapped and non delay */ - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - } else { - /* - * We can't do block allocation here - * so just redity the page and unlock - * and return - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } + goto redirty_page; + } + if (commit_write) /* now mark the buffer_heads as dirty and uptodate */ block_commit_write(page, 0, len); - } - if (PageChecked(page) && ext4_should_journal_data(inode)) { + if (PageChecked(page) && ext4_should_journal_data(inode)) /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ - ClearPageChecked(page); return __ext4_journalled_writepage(page, len); - } - if (page_bufs && buffer_uninit(page_bufs)) { + if (buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); @@ -2824,25 +2820,32 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) */ static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, - struct mpage_da_data *mpd) + struct mpage_da_data *mpd, + pgoff_t *done_index) { int ret = 0; int done = 0; struct pagevec pvec; - int nr_pages; + unsigned nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ long nr_to_write = wbc->nr_to_write; + int tag; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + *done_index = index; while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -2862,6 +2865,8 @@ static int write_cache_pages_da(struct address_space *mapping, break; } + *done_index = page->index + 1; + lock_page(page); /* @@ -2947,6 +2952,8 @@ static int ext4_da_writepages(struct address_space *mapping, long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; + pgoff_t end; trace_ext4_da_writepages(inode, wbc); @@ -2982,8 +2989,11 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->range_start = index << PAGE_CACHE_SHIFT; wbc->range_end = LLONG_MAX; wbc->range_cyclic = 0; - } else + end = -1; + } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } /* * This works around two forms of stupidity. The first is in @@ -3002,9 +3012,12 @@ static int ext4_da_writepages(struct address_space *mapping, * sbi->max_writeback_mb_bump whichever is smaller. 
*/ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) - desired_nr_to_write = wbc->nr_to_write * 8; - else + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else desired_nr_to_write = ext4_num_dirty_pages(inode, index, max_pages); if (desired_nr_to_write > max_pages) @@ -3021,6 +3034,9 @@ static int ext4_da_writepages(struct address_space *mapping, pages_skipped = wbc->pages_skipped; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!ret && wbc->nr_to_write > 0) { /* @@ -3059,16 +3075,14 @@ retry: mpd.io_done = 0; mpd.pages_written = 0; mpd.retval = 0; - ret = write_cache_pages_da(mapping, wbc, &mpd); + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit * them for I/O. */ if (!mpd.io_done && mpd.next_page != mpd.first_page) { - if (mpage_da_map_blocks(&mpd) == 0) - mpage_da_submit_io(&mpd); - mpd.io_done = 1; + mpage_da_map_and_submit(&mpd); ret = MPAGE_DA_EXTENT_TAIL; } trace_ext4_da_write_pages(inode, &mpd); @@ -3115,14 +3129,13 @@ retry: __func__, wbc->nr_to_write, ret); /* Update index */ - index += pages_written; wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = index; + mapping->writeback_index = done_index; out_writepages: wbc->nr_to_write -= nr_to_writebump; @@ -3367,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writeback() but that + * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. 
So instead we would need to * replicate parts of the code in the above functions, * simplifying them becuase we wouldn't actually intend to @@ -3457,15 +3470,6 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - if (io->page) - put_page(io->page); - iput(io->inode); - kfree(io); -} - static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) { struct buffer_head *head, *bh; @@ -3642,173 +3646,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * check a range of space and convert unwritten extents to written. - */ -static int ext4_end_io_nolock(ext4_io_end_t *io) -{ - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - int ret = 0; - - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," - "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); - - if (list_empty(&io->list)) - return ret; - - if (io->flag != EXT4_IO_UNWRITTEN) - return ret; - - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d" - " io is still on inode %lu aio dio list\n", - __func__, ret, inode->i_ino); - return ret; - } - - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - /* clear the DIO AIO unwritten flag */ - io->flag = 0; - return ret; -} - -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) -{ - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret; - - mutex_lock(&inode->i_mutex); - ret = ext4_end_io_nolock(io); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - return; - } - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (!list_empty(&io->list)) - list_del_init(&io->list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - mutex_unlock(&inode->i_mutex); - ext4_free_io_end(io); -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. 
This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - if (list_empty(&ei->i_completed_io_list)) - return ret; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * convertion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (ret < 0) - ret2 = ret; - else - list_del_init(&io->list); - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) -{ - ext4_io_end_t *io = NULL; - - io = kmalloc(sizeof(*io), flags); - - if (io) { - igrab(inode); - io->inode = inode; - io->flag = 0; - io->offset = 0; - io->size = 0; - io->page = NULL; - io->iocb = NULL; - io->result = 0; - INIT_WORK(&io->work, ext4_end_io_work); - INIT_LIST_HEAD(&io->list); - } - - return io; -} - static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) @@ -3828,7 +3665,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != EXT4_IO_UNWRITTEN){ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); iocb->private = NULL; out: @@ -3845,14 +3682,14 @@ out: } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); - /* Add the io_end to per-inode completed aio dio list*/ ei = EXT4_I(io_end->inode); spin_lock_irqsave(&ei->i_completed_io_lock, flags); list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); iocb->private = NULL; } @@ -3873,7 +3710,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_UNWRITTEN; + io_end->flag = EXT4_IO_END_UNWRITTEN; inode = io_end->inode; /* Add the io_end to per-inode completed io list*/ @@ -3901,8 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) retry: io_end = ext4_init_io_end(inode, GFP_ATOMIC); if (!io_end) { - if (printk_ratelimit()) - printk(KERN_WARNING "%s: allocation fail\n", __func__); + pr_warn_ratelimited("%s: allocation fail\n", __func__); schedule(); goto retry; } @@ -3926,9 +3762,9 @@ retry: * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. 
* - * For holes, we fallocate those blocks, mark them as unintialized + * For holes, we fallocate those blocks, mark them as uninitialized * If those blocks were preallocated, we mark sure they are splited, but - * still keep the range to write as unintialized. + * still keep the range to write as uninitialized. * * The unwrritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -4226,7 +4062,7 @@ int ext4_block_truncate_page(handle_t *handle, if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -4350,6 +4186,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, { __le32 *p; int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; @@ -4365,11 +4202,23 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; + } + err = ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); + if (unlikely(err)) { + ext4_std_error(inode->i_sb, err); + return 1; } - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -4530,6 +4379,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); + brelse(bh); /* * Everything below this this pointer has been @@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state_flags = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. 
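The first hunk below makes the narrowing of the in-core flags explicit when they are written back to the on-disk inode: the in-core field can be wider than 32 bits on 64-bit machines, while the on-disk flags word is a 32-bit little-endian field, so only the low 32 bits can be persisted. A minimal userspace sketch of the masking (the values are invented for illustration; 0x80000 stands in for ext4's extents flag):

#include <stdio.h>

int main(void)
{
	/* model a 64-bit in-core flags word: one persistable flag in
	 * the low 32 bits, plus a hypothetical high bit that must not
	 * leak to disk */
	unsigned long long in_core = 0x8000000000080000ULL;
	unsigned int on_disk = (unsigned int)(in_core & 0xFFFFFFFF);

	printf("in-core flags: %#llx\n", in_core);
	printf("on-disk flags: %#x\n", on_disk);	/* only the low 32 bits survive */
	return 0;
}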
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; int error, rc = 0; + int orphan = 0; const unsigned int ia_valid = attr->ia_valid; error = inode_change_ok(inode, attr); @@ -5519,8 +5370,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - - error = ext4_orphan_add(handle, inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5538,6 +5391,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } ext4_orphan_del(handle, inode); + orphan = 0; ext4_journal_stop(handle); goto err_out; } @@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ - if (inode->i_nlink) + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) @@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; return 0; @@ -5643,7 +5495,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5831,6 +5683,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae883b1b..eb3bc2fe647 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,6 +331,30 @@ mext_out: return err; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4b4ad4b7ce5..851f49b2f9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -338,6 +338,14 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct 
kmem_cache *ext4_free_ext_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES \ + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1) +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, @@ -939,6 +947,85 @@ out: } /* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * makes sure other parallel operations on the buddy + * cache don't happen while holding the buddy cache + * lock + */ +static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, + ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= ngroups) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups' write allocation + * semaphores. This makes sure there is + * no block allocation going on in any + * of those groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; +} + +static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* release each group's write allocation + * semaphore taken in + * ext4_mb_get_buddy_cache_lock() + */ + up_write(&grp->alloc_sem); + } + +} + +/* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! @@ -1915,84 +2002,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -/* - * lock the group_info alloc_sem of all the groups - * belonging to the same buddy cache page.
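The index arithmetic in ext4_mb_get_buddy_cache_lock() above is easier to verify with concrete numbers; a standalone sketch with assumed sizes (4 KiB pages and 1 KiB blocks, not taken from a real superblock):

    #include <stdio.h>

    int main(void)
    {
            int page_size = 4096, blocksize = 1024;
            int blocks_per_page = page_size / blocksize;      /* 4 */
            int groups_per_page = blocks_per_page >> 1;       /* 2 */
            int group = 5;
            int block = group * 2;        /* bitmap + buddy block per group */
            int pnum = block / blocks_per_page;               /* page 2 */
            int first_group = pnum * blocks_per_page / 2;     /* group 4 */

            /* Page 2 covers groups 4 and 5, so both get locked. */
            printf("locking groups %d..%d\n",
                   first_group, first_group + groups_per_page - 1);
            return 0;
    }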
This - * make sure other parallel operation on the buddy - * cache doesn't happen whild holding the buddy cache - * lock - */ -int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) -{ - int i; - int block, pnum; - int blocks_per_page; - int groups_per_page; - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - - if ((first_group + i) >= ngroups) - break; - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - down_write_nested(&grp->alloc_sem, i); - } - return i; -} - -void ext4_mb_put_buddy_cache_lock(struct super_block *sb, - ext4_group_t group, int locked_group) -{ - int i; - int block, pnum; - int blocks_per_page; - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - /* release locks on all the groups */ - for (i = 0; i < locked_group; i++) { - - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - up_write(&grp->alloc_sem); - } - -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -2233,15 +2242,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { - int i, len; + int i; int metalen = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. @@ -2261,22 +2279,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info; } - /* - * calculate needed size. 
if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = offsetof(typeof(**meta_group_info), - bb_counters[sb->s_blocksize_bits + 2]); - meta_group_info = sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kzalloc(len, GFP_KERNEL); + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); goto exit_group_info; } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); @@ -2331,6 +2343,7 @@ static int ext4_mb_init_backend(struct super_block *sb) int num_meta_group_infos_max; int array_size; struct ext4_group_desc *desc; + struct kmem_cache *cachep; /* This is the number of blocks used by GDT */ num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - @@ -2373,6 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb) printk(KERN_ERR "EXT4-fs: can't get new inode\n"); goto err_freesgi; } + sbi->s_buddy_cache->i_ino = get_next_ino(); EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); @@ -2388,8 +2402,9 @@ static int ext4_mb_init_backend(struct super_block *sb) return 0; err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) - kfree(ext4_get_group_info(sb, i)); + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); i = num_meta_group_infos; while (i-- > 0) kfree(sbi->s_group_info[i]); @@ -2406,19 +2421,48 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned offset; unsigned max; int ret; + int cache_index; + struct kmem_cache *cachep; + char *namep = NULL; i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - return -ENOMEM; + ret = -ENOMEM; + goto out; } i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_offsets); - return -ENOMEM; + ret = -ENOMEM; + goto out; + } + + cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + cachep = ext4_groupinfo_caches[cache_index]; + if (!cachep) { + char name[32]; + int len = offsetof(struct ext4_group_info, + bb_counters[sb->s_blocksize_bits + 2]); + + sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits); + namep = kstrdup(name, GFP_KERNEL); + if (!namep) { + ret = -ENOMEM; + goto out; + } + + /* Need to free the kmem_cache_name() when we + * destroy the slab */ + cachep = kmem_cache_create(namep, len, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!cachep) { + ret = -ENOMEM; + goto out; + } + ext4_groupinfo_caches[cache_index] = cachep; } /* order 0 is regular bitmap */ @@ -2439,9 +2483,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return ret; + goto out; } spin_lock_init(&sbi->s_md_lock); @@ -2456,9 +2498,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2475,7 +2516,13 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if 
(sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; +out: + if (ret) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + kfree(namep); + } + return ret; } /* needs to be called with the ext4 group lock held */ @@ -2503,6 +2550,7 @@ int ext4_mb_release(struct super_block *sb) int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { @@ -2513,7 +2561,7 @@ int ext4_mb_release(struct super_block *sb) ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); - kfree(grinfo); + kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> @@ -2557,20 +2605,15 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static inline void ext4_issue_discard(struct super_block *sb, +static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t block, int count) { - int ret; ext4_fsblk_t discard_block; discard_block = block + ext4_group_first_block_no(sb, block_group); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - ret = sb_issue_discard(sb, discard_block, count); - if (ret == EOPNOTSUPP) { - ext4_warning(sb, "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } /* @@ -2582,7 +2625,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, ret, count = 0, count2 = 0; struct ext4_free_data *entry; struct list_head *l, *ltmp; @@ -2592,9 +2635,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, + if (test_opt(sb, DISCARD)) { + ret = ext4_issue_discard(sb, entry->group, entry->start_blk, entry->count); + if (unlikely(ret == -EOPNOTSUPP)) { + ext4_warning(sb, "discard not supported, " + "disabling"); + clear_opt(sb, DISCARD); + } + } err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2658,28 +2707,22 @@ static void ext4_remove_debugfs_entry(void) #endif -int __init init_ext4_mballoc(void) +int __init ext4_init_mballoc(void) { - ext4_pspace_cachep = - kmem_cache_create("ext4_prealloc_space", - sizeof(struct ext4_prealloc_space), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; - ext4_ac_cachep = - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } - ext4_free_ext_cachep = - kmem_cache_create("ext4_free_block_extents", - sizeof(struct ext4_free_data), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); if (ext4_free_ext_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); @@ -2689,8 +2732,9 @@ int __init init_ext4_mballoc(void) 
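The cache-creation conversions just above use the KMEM_CACHE() helper from <linux/slab.h>, which derives the cache name, size, and alignment from the struct itself. Roughly, the first call expands to something like this (a sketch of the effect, not the literal macro expansion):

    /* KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT) is about: */
    ext4_pspace_cachep =
            kmem_cache_create("ext4_prealloc_space",
                              sizeof(struct ext4_prealloc_space),
                              __alignof__(struct ext4_prealloc_space),
                              SLAB_RECLAIM_ACCOUNT, NULL);

The only behavioral difference from the removed open-coded calls is the non-zero alignment argument.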
return 0; } -void exit_ext4_mballoc(void) +void ext4_exit_mballoc(void) { + int i; /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. @@ -2699,6 +2743,15 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + struct kmem_cache *cachep = ext4_groupinfo_caches[i]; + if (cachep) { + char *name = (char *)kmem_cache_name(cachep); + kmem_cache_destroy(cachep); + kfree(name); + } + } ext4_remove_debugfs_entry(); } @@ -3535,8 +3588,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3554,11 +3606,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) @@ -3569,16 +3616,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } - - trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, - next - bit); + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, + grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3601,29 +3641,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, ac, pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3644,7 +3674,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3673,9 +3702,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; repeat: ext4_lock_group(sb, 
group); list_for_each_entry_safe(pa, tmp, @@ -3730,9 +3756,9 @@ repeat: spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3740,8 +3766,6 @@ repeat: out: ext4_unlock_group(sb, group); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; @@ -3762,7 +3786,6 @@ void ext4_discard_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3778,11 +3801,6 @@ void ext4_discard_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = inode; - } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -3852,7 +3870,7 @@ repeat: ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -3861,23 +3879,8 @@ repeat: list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { @@ -4060,14 +4063,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4119,15 +4118,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4273,7 +4270,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. 
*/ - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ar->flags |= EXT4_MB_DELALLOC_RESERVED; else { /* Without delayed allocation we need to verify @@ -4370,7 +4367,8 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); if (!ar->len) { - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); @@ -4491,7 +4489,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; unsigned long freed = 0; unsigned int overflow; @@ -4531,6 +4528,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); + if (unlikely(!tbh)) + continue; ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); } @@ -4546,12 +4545,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_inode = inode; - ac->ac_sb = sb; - } - do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4609,12 +4602,7 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - trace_ext4_mballoc_free(ac); - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4626,7 +4614,11 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + if (!new_entry) { + err = -ENOMEM; + goto error_return; + } new_entry->start_blk = bit; new_entry->group = block_group; new_entry->count = count; @@ -4643,9 +4635,6 @@ do_more: ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4685,7 +4674,194 @@ error_return: dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To ensure that no + * one will allocate those blocks, mark them as used in the buddy bitmap. + * This must be called under the group lock. 
+ */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex; + int ret = 0; + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + + ret = ext4_issue_discard(sb, group, start, count); + + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @e4b: ext4 buddy + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through the group's buddy bitmap searching for + * free extents. When a free extent of at least @minblocks is found, + * ext4_trim_extent marks it as used in the buddy bitmap, issues a TRIM + * command on it, and then frees it again in the buddy bitmap. This is + * repeated until the whole group is scanned. + */ +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0; + ext4_group_t group; + int ret = 0; + + BUG_ON(e4b == NULL); + + bitmap = e4b->bd_bitmap; + group = e4b->bd_group; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; + ext4_lock_group(sb, group); + + while (start < max) { + start = mb_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = mb_find_next_bit(bitmap, max, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, + next - start, group, e4b); + if (ret < 0) + break; + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b->bd_info->bb_free - count) < minblocks) + break; + } + ext4_unlock_group(sb, group); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + if (ret < 0) + count = ret; + + return count; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: first byte to trim + * len: number of bytes to trim from start + * minlen: minimum extent length in bytes + * + * ext4_trim_fs goes through all allocation groups containing bytes from + * start to start+len. For each such group the ext4_trim_all_free function + * is invoked to trim all free space. 
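The byte-to-block conversions at the top of ext4_trim_fs() below are plain shifts by s_blocksize_bits; a worked example with assumed values (4 KiB blocks, so s_blocksize_bits = 12; the numbers are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int blkbits = 12;                    /* 4 KiB blocks */
            uint64_t start  = (1ULL << 20) >> blkbits;    /* 1 MiB  -> 256 */
            uint64_t len    = (512ULL << 20) >> blkbits;  /* 512 MiB -> 131072 */
            uint64_t minlen = (64ULL << 10) >> blkbits;   /* 64 KiB -> 16 */

            printf("start=%llu len=%llu minlen=%llu blocks\n",
                   (unsigned long long)start, (unsigned long long)len,
                   (unsigned long long)minlen);
            return 0;
    }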
+ */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_buddy e4b; + ext4_group_t first_group, last_group; + ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_grpblk_t cnt = 0, first_block, last_block; + uint64_t start, len, minlen, trimmed; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + len = range->len >> sb->s_blocksize_bits; + minlen = range->minlen >> sb->s_blocksize_bits; + trimmed = 0; + + if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) + return -EINVAL; + if (start < first_data_blk) { + len -= first_data_blk - start; + start = first_data_blk; + } + + /* Determine first and last group to examine based on start and len */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_block); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + &last_group, &last_block); + last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; + last_block = EXT4_BLOCKS_PER_GROUP(sb); + + if (first_group > last_group) + return -EINVAL; + + for (group = first_group; group <= last_group; group++) { + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + break; + } + + if (len >= EXT4_BLOCKS_PER_GROUP(sb)) + len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); + else + last_block = first_block + len; + + if (e4b.bd_info->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, &e4b, first_block, + last_block, minlen); + if (cnt < 0) { + ret = cnt; + ext4_mb_unload_buddy(&e4b); + break; + } + } + ext4_mb_unload_buddy(&e4b); + trimmed += cnt; + first_block = 0; + } + range->len = trimmed * sb->s_blocksize; + + return ret; +} diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 1765c2c50a9..b0a126f23c2 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -412,7 +412,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct buffer_head *bh; struct ext4_extent_header *eh; - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); bh = sb_bread(inode->i_sb, block); if (!bh) return -EIO; @@ -496,7 +496,7 @@ int ext4_ext_migrate(struct inode *inode) goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0, goal); + S_IFREG, NULL, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5f1ed9fc913..b9f3e7862f1 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -85,7 +85,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext_pblock(path[ppos].p_ext); + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); return 0; } @@ -96,7 +96,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, /* index block */ path[ppos].p_idx++; - path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); if (path[ppos+1].p_bh) brelse(path[ppos+1].p_bh); path[ppos+1].p_bh = @@ -111,7 +111,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, path[cur_ppos].p_idx = EXT_FIRST_INDEX(path[cur_ppos].p_hdr); path[cur_ppos].p_block = - idx_pblock(path[cur_ppos].p_idx); + 
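With ext4_trim_fs() wired up to the FITRIM case added to ext4_ioctl() earlier in this patch, the whole path can be exercised from user space. A minimal caller (the mount-point path is made up; FITRIM and struct fstrim_range come from <linux/fs.h>):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(void)
    {
            struct fstrim_range range;
            int fd = open("/mnt/ext4", O_RDONLY);

            if (fd < 0)
                    return 1;
            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = ~0ULL;        /* whole filesystem */
            range.minlen = 64 * 1024; /* skip free extents below 64 KiB */
            if (ioctl(fd, FITRIM, &range) == 0)
                    /* On return the kernel reports the trimmed byte count. */
                    printf("trimmed %llu bytes\n",
                           (unsigned long long)range.len);
            return close(fd);
    }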
ext4_idx_pblock(path[cur_ppos].p_idx); if (path[cur_ppos+1].p_bh) brelse(path[cur_ppos+1].p_bh); path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, @@ -133,7 +133,7 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); path[leaf_ppos].p_block = - ext_pblock(path[leaf_ppos].p_ext); + ext4_ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -249,7 +249,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, */ o_end->ee_block = end_ext->ee_block; o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); } o_start->ee_len = start_ext->ee_len; @@ -276,7 +276,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, */ o_end->ee_block = end_ext->ee_block; o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); /* * Set 0 to the extent block if new_ext was @@ -361,7 +361,7 @@ mext_insert_inside_block(struct ext4_extent *o_start, /* Insert new entry */ if (new_ext->ee_len) { o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); } /* Insert end entry */ @@ -488,7 +488,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, start_ext.ee_len = end_ext.ee_len = 0; new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); new_ext.ee_len = dext->ee_len; new_ext_alen = ext4_ext_get_actual_len(&new_ext); new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; @@ -553,7 +553,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, copy_extent_status(oext, &end_ext); end_ext_alen = ext4_ext_get_actual_len(&end_ext); ext4_ext_store_pblock(&end_ext, - (ext_pblock(o_end) + oext_alen - end_ext_alen)); + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); end_ext.ee_block = cpu_to_le32(le32_to_cpu(o_end->ee_block) + oext_alen - end_ext_alen); @@ -604,7 +604,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, /* When tmp_dext is too large, pick up the target range. 
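The ext_pblock() to ext4_ext_pblock() renames in this file are mechanical; the helper only reassembles the 48-bit physical block number that the on-disk extent splits across two fields. A sketch of that logic (kernel context assumed; field handling mirrors struct ext4_extent, written with a plain 32-bit shift for clarity):

    /* 48-bit physical block: high 16 bits in ee_start_hi, low 32 bits in
     * ee_start_lo, both little-endian on disk. */
    static inline ext4_fsblk_t ext_pblock_sketch(struct ext4_extent *ex)
    {
            ext4_fsblk_t block;

            block = le32_to_cpu(ex->ee_start_lo);
            block |= (ext4_fsblk_t)le16_to_cpu(ex->ee_start_hi) << 32;
            return block;
    }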
*/ diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); tmp_dext->ee_block = cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); @@ -613,7 +613,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, tmp_dext->ee_len = cpu_to_le16(max_count); orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); /* Adjust extent length if donor extent is larger than orig */ if (ext4_ext_get_actual_len(tmp_dext) > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 314c0d3b3fa..5485390d32c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -581,9 +581,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry(dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - bh->b_data))) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; @@ -820,7 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -856,6 +856,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block, b; + const u8 *name = d_name->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -870,6 +871,16 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); /* @@ -960,55 +971,35 @@ cleanup_and_exit: static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *err) { - struct super_block * sb; + struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext4_dir_entry_2 *de, *top; struct buffer_head *bh; ext4_lblk_t block; int retval; - int namelen = d_name->len; - const u8 *name = d_name->name; - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread (NULL,dir, block, 0, err))) + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) goto errout; - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { - int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) - + ((char *) de - bh->b_data); - - if (!ext4_check_dir_entry(dir, de, bh, off)) { - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - if (ext4_match(namelen, name, de)) { - *res_dir = de; - dx_release(frames); - return bh; - } + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! */ + dx_release(frames); + return bh; } brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hash, frame, + retval = ext4_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning(sb, @@ -1045,7 +1036,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (unlikely(IS_ERR(inode))) { + if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ESTALE) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1278,7 +1269,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry(dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1611,7 +1602,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1639,17 +1634,21 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int i; + int i, err; i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry(dir, de, bh, i)) + if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); - ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + return err; + } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -1661,7 +1660,11 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { + ext4_std_error(dir->i_sb, err); + 
return err; + } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -1798,7 +1801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; struct inode *inode; - struct buffer_head *dir_block; + struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; unsigned int blocksize = dir->i_sb->s_blocksize; int err, retries = 0; @@ -1831,7 +1834,9 @@ retry: if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_clear_inode; de = (struct ext4_dir_entry_2 *) dir_block->b_data; de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; @@ -1848,10 +1853,12 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, dir, dir_block); - brelse(dir_block); - ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry(handle, dentry, inode); + err = ext4_handle_dirty_metadata(handle, dir, dir_block); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); @@ -1862,10 +1869,13 @@ out_clear_inode: } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; d_instantiate(dentry, inode); unlock_new_inode(inode); out_stop: + brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -1928,7 +1938,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry(inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2312,7 +2322,7 @@ retry: inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); + ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { @@ -2416,7 +2426,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } brelse(new_bh); new_bh = NULL; } @@ -2468,7 +2482,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + if (retval) { + ext4_std_error(old_dir->i_sb, retval); + goto end_rename; + } ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 00000000000..7270dcfca92 --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,428 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore 
Ts'o, 2010. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +static struct kmem_cache *io_page_cachep, *io_end_cachep; + +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + +int __init ext4_init_pageio(void) +{ + int i; + + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); + if (io_page_cachep == NULL) + return -ENOMEM; + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) { + kmem_cache_destroy(io_page_cachep); + return -ENOMEM; + } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); + + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_page_cachep); +} + +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + +void ext4_free_io_end(ext4_io_end_t *io) +{ + int i; + wait_queue_head_t *wq; + + BUG_ON(!io); + if (io->page) + put_page(io->page); + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); + io->num_io_pages = 0; + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); + kmem_cache_free(io_end_cachep, io); +} + +/* + * check a range of space and convert unwritten extents to written. 
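One design note on the hashed wait table above: WQ_HASH_SZ is only 37, so two inodes can map to the same bucket and a wake_up_all() may wake waiters for an unrelated inode. That is harmless because wait_event() re-checks its condition after every wakeup; the cost of a collision is a spurious test. For example (hypothetical pointer values):

    /* Pointers congruent modulo WQ_HASH_SZ share one wait queue. */
    wait_queue_head_t *a = to_ioend_wq((void *)0x1000);
    wait_queue_head_t *b = to_ioend_wq((void *)(0x1000 + WQ_HASH_SZ));
    /* a == b: both waiters sleep on the same queue, and each one
     * re-checks its own i_ioend_count when woken. */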
+ */ +int ext4_end_io_nolock(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu, list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (!(io->flag & EXT4_IO_END_UNWRITTEN)) + return ret; + + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten " + "extents to written extents, error is %d " + "io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + if (io->iocb) + aio_complete(io->iocb, io->result, 0); + /* clear the DIO AIO unwritten flag */ + io->flag &= ~EXT4_IO_END_UNWRITTEN; + return ret; +} + +/* + * work on completed aio dio IO, to convert unwritten extents to written extents + */ +static void ext4_end_io_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; + } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); + if (io) { + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_WORK(&io->work, ext4_end_io_work); + INIT_LIST_HEAD(&io->list); + } + return io; +} + +/* + * Print a buffer I/O error compatible with fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace a la Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... 
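The split between ext4_end_bio() (below) and ext4_end_io_work() exists because bio completion runs in atomic (interrupt) context, while extent conversion takes i_mutex and starts a journal transaction and so must be able to sleep. Condensed, the deferral looks like this (a sketch of the shape of the code that follows, not an independent implementation):

    /* Interrupt context: do no blocking work, just hand off. */
    static void end_bio_sketch(struct bio *bio, int error)
    {
            ext4_io_end_t *io_end = bio->bi_private;
            struct workqueue_struct *wq =
                    EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

            queue_work(wq, &io_end->work); /* runs ext4_end_io_work() later */
    }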
+ */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + int i; + + BUG_ON(!io_end); + bio->bi_private = NULL; + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + bio_put(bio); + + for (i = 0; i < io_end->num_io_pages; i++) { + struct page *page = io_end->pages[i]->p_page; + struct buffer_head *bh, *head; + int partial_write = 0; + + head = page_buffers(page); + if (error) + SetPageError(page); + BUG_ON(!head); + if (head->b_size == PAGE_CACHE_SIZE) + clear_buffer_dirty(head); + else { + loff_t offset; + loff_t io_end_offset = io_end->offset + io_end->size; + + offset = (sector_t) page->index << PAGE_CACHE_SHIFT; + bh = head; + do { + if ((offset >= io_end->offset) && + (offset+bh->b_size <= io_end_offset)) { + if (error) + buffer_io_error(bh); + + clear_buffer_dirty(bh); + } + if (buffer_delay(bh)) + partial_write = 1; + else if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + else if (buffer_dirty(bh)) + partial_write = 1; + offset += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + /* + * If this is a partial write which happened to make + * all buffers uptodate then we can optimize away a + * bogus readpage() for the next read(). Here we + * 'discover' whether the page went uptodate as a + * result of this (potentially partial) write. + */ + if (!partial_write) + SetPageUptodate(page); + + put_io_page(io_end->pages[i]); + } + io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = 0; + io->io_op = 0; + io->io_end = 0; +} + +static int io_submit_init(struct ext4_io_submit *io, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) + return -ENOMEM; + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (bio == NULL); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_private = io->io_end = io_end; + bio->bi_end_io = ext4_end_bio; + + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + + io->io_bio = bio; + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC_PLUG : WRITE); + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct ext4_io_page *io_page, + struct inode *inode, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + ext4_io_end_t *io_end; + int ret; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (!buffer_mapped(bh) || buffer_delay(bh)) { + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + return 0; + } + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init(io, inode, wbc, bh); + if (ret) + return ret; + } + io_end = io->io_end; + if ((io_end->num_io_pages >= MAX_IO_PAGES) && + (io_end->pages[io_end->num_io_pages-1] != io_page)) + goto submit_and_retry; + if (buffer_uninit(bh)) + io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + io->io_end->size += bh->b_size; + io->io_next_block++; + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + if ((io_end->num_io_pages == 0) || + (io_end->pages[io_end->num_io_pages-1] != io_page)) { + io_end->pages[io_end->num_io_pages++] = io_page; + atomic_inc(&io_page->p_count); + } + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + struct ext4_io_page *io_page; + struct buffer_head *bh, *head; + int ret = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + ClearPageError(page); + + io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); + if (!io_page) { + set_page_dirty(page); + unlock_page(page); + return -ENOMEM; + } + io_page->p_page = page; + atomic_set(&io_page->p_count, 1); + get_page(page); + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_start >= len) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + set_page_dirty(page); + break; + } + } + unlock_page(page); + /* + * If the page was truncated before we could do the writeback, + * or we had a memory allocation error while trying to write + * the first buffer head, we won't have submitted any pages for + * I/O. In that case we need to make sure we've cleared the + * PageWriteback bit from the page to prevent the system from + * wedging later on. 
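io_submit_add_bh() above leans on the calling convention of bio_add_page(): it returns the number of bytes actually added, and anything short of the request means the bio is full and must be submitted before a fresh one is started. The bare idiom (a generic sketch, not ext4-specific):

    /* Fill a bio; when it refuses the page, ship it and start over. */
    if (bio_add_page(bio, page, len, offset) < len) {
            submit_bio(WRITE, bio);
            bio = bio_alloc(GFP_NOIO, bio_get_nr_vecs(bdev));
            /* ...reinitialize bi_sector/bi_bdev/bi_end_io, re-add page... */
    }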
+ */ + put_io_page(io_page); + return ret; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ca5c8aa00a2..3ecc6e45d2f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -220,29 +220,25 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_handle_dirty_metadata(handle, NULL, gdb); + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto exit_bh; + } ext4_set_bit(bit, bh->b_data); brelse(gdb); } /* Zero out all of the reserved backup group descriptor table blocks */ - for (i = 0, bit = gdblocks + 1, block = start + bit; - i < reserved_gdb; i++, block++, bit++) { - struct buffer_head *gdb; - - ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; - - if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(gdb); - goto exit_bh; - } - ext4_handle_dirty_metadata(handle, NULL, gdb); + ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, + GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) ext4_set_bit(bit, bh->b_data); - brelse(gdb); - } + ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); ext4_set_bit(input->block_bitmap - start, bh->b_data); @@ -251,29 +247,26 @@ static int setup_new_group_blocks(struct super_block *sb, ext4_set_bit(input->inode_bitmap - start, bh->b_data); /* Zero out all of the inode table blocks */ - for (i = 0, block = input->inode_table, bit = block - start; - i < sbi->s_itb_per_group; i++, bit++, block++) { - struct buffer_head *it; - - ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; - - if (IS_ERR(it = bclean(handle, sb, block))) { - err = PTR_ERR(it); - goto exit_bh; - } - ext4_handle_dirty_metadata(handle, NULL, it); - brelse(it); + block = input->inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); + if (err) + goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) ext4_set_bit(bit, bh->b_data); - } if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; - mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_bh; + } brelse(bh); /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", @@ -283,9 +276,11 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - ext4_handle_dirty_metadata(handle, NULL, bh); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); exit_bh: brelse(bh); @@ -437,17 +432,21 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } - if ((err = 
ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, *primary))) + err = ext4_journal_get_write_access(handle, *primary); + if (unlikely(err)) goto exit_sbh; - if ((err = ext4_journal_get_write_access(handle, dind))) - goto exit_primary; + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); /* ext4_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) goto exit_dindj; n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -469,12 +468,20 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_handle_dirty_metadata(handle, NULL, dind); - brelse(dind); + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_handle_dirty_metadata(handle, NULL, *primary); + err = ext4_handle_dirty_metadata(handle, NULL, *primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -485,19 +492,19 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); - return 0; + return err; exit_inode: /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: /* ext4_journal_release_buffer(handle, dind); */ -exit_primary: - /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - /* ext4_journal_release_buffer(handle, *primary); */ + /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -680,7 +687,9 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_handle_dirty_metadata(handle, NULL, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -898,7 +907,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_handle_dirty_metadata(handle, NULL, primary); + err = ext4_handle_dirty_metadata(handle, NULL, primary); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_journal; + } /* Update the reserved block counts only once the new group is * active. 
*/ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26147746c27..48ce561fafa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> @@ -41,6 +40,9 @@ #include <linux/crc16.h> #include <asm/uaccess.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -50,8 +52,11 @@ #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> -struct proc_dir_entry *ext4_proc_root; +static struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; +struct ext4_lazy_init *ext4_li_info; +struct mutex ext4_li_mtx; +struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -68,14 +73,16 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -381,13 +388,14 @@ static void ext4_handle_error(struct super_block *sb) void __ext4_error(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", - sb->s_id, function, line, current->comm); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); va_end(args); ext4_handle_error(sb); @@ -398,28 +406,31 @@ void ext4_error_inode(struct inode *inode, const char *function, const char *fmt, ...) { va_list args; + struct va_format vaf; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); save_error_info(inode->i_sb, function, line); va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk("block %llu: ", block); - printk("comm %s: ", current->comm); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu: ", block); + printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); } void ext4_error_file(struct file *file, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) 
{ va_list args; + struct va_format vaf; struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; @@ -427,17 +438,18 @@ void ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); save_error_info(inode->i_sb, function, line); - va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (!path) + if (IS_ERR(path)) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu " - "(comm %s path %s): ", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path); - vprintk(fmt, args); - printk("\n"); + "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk(KERN_CONT "block %llu: ", block); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -536,28 +548,29 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg (struct super_block * sb, const char *prefix, - const char *fmt, ...) +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk("%sEXT4-fs (%s): ", prefix, sb->s_id); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); va_end(args); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", - sb->s_id, function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); } @@ -568,21 +581,25 @@ void __ext4_grp_locked_error(const char *function, unsigned int line, __releases(bitlock) __acquires(bitlock) { + struct va_format vaf; va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", sb->s_id, function, line, grp); if (ino) - printk("inode %lu: ", ino); + printk(KERN_CONT "inode %lu: ", ino); if (block) - printk("block %llu:", (unsigned long long) block); - vprintk(fmt, args); - printk("\n"); + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); va_end(args); if (test_opt(sb, ERRORS_CONT)) { @@ -640,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -656,8 +673,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -702,13 +718,13 @@ static void ext4_put_super(struct super_block *sb) struct 
ext4_super_block *es = sbi->s_es; int i, err; + ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); lock_super(sb); - lock_kernel(); if (sb->s_dirt) ext4_commit_super(sb, 1); @@ -719,6 +735,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -775,7 +792,6 @@ static void ext4_put_super(struct super_block *sb) * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ - unlock_kernel(); unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); @@ -801,32 +817,43 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); - /* - * Note: We can be called before EXT4_SB(sb)->s_journal is set, - * therefore it can be null here. Don't check it, just initialize - * jinode. - */ - jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif + ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -836,7 +863,7 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); + call_rcu(&inode->i_rcu, ext4_i_callback); } static void init_once(void *foo) @@ -874,9 +901,12 @@ void ext4_clear_inode(struct inode *inode) end_writeback(inode); dquot_drop(inode); ext4_discard_preallocations(inode); - if (EXT4_JOURNAL(inode)) - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, - &EXT4_I(inode)->jinode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } } static inline void ext4_show_quota_options(struct seq_file *seq, @@ -1009,6 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); + if (test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",mblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1045,6 +1077,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) seq_puts(seq, 
",block_validity"); + if (!test_opt(sb, INIT_INODE_TABLE)) + seq_puts(seq, ",noinit_inode_table"); + else if (sbi->s_li_wait_mult) + seq_printf(seq, ",init_inode_table=%u", + (unsigned) sbi->s_li_wait_mult); + ext4_show_quota_options(seq, sb); return 0; @@ -1123,7 +1161,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path); + struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -1160,6 +1198,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1180,6 +1219,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -1214,11 +1254,12 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, + Opt_init_inode_table, Opt_noinit_inode_table, }; static const match_table_t tokens = { @@ -1278,6 +1319,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1289,6 +1332,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_init_inode_table, "init_itable=%u"}, + {Opt_init_inode_table, "init_itable"}, + {Opt_noinit_inode_table, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1353,7 +1399,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) sbi->s_qf_names[qtype] = NULL; return 0; } - set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sb, QUOTA); return 1; } @@ -1408,21 +1454,21 @@ static int parse_options(char *options, struct super_block *sb, switch (token) { case Opt_bsd_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, MINIX_DF); + clear_opt(sb, MINIX_DF); break; case Opt_minix_df: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, MINIX_DF); + set_opt(sb, MINIX_DF); break; case Opt_grpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); break; case Opt_nogrpid: ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sbi->s_mount_opt, GRPID); + clear_opt(sb, 
GRPID); break; case Opt_resuid: @@ -1440,38 +1486,38 @@ static int parse_options(char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_RO); - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + set_opt(sb, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt(sbi->s_mount_opt, ERRORS_CONT); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sb, ERRORS_CONT); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_RO); break; case Opt_err_cont: - clear_opt(sbi->s_mount_opt, ERRORS_RO); - clear_opt(sbi->s_mount_opt, ERRORS_PANIC); - set_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sb, ERRORS_RO); + clear_opt(sb, ERRORS_PANIC); + set_opt(sb, ERRORS_CONT); break; case Opt_nouid32: - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); break; case Opt_debug: - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); break; case Opt_oldalloc: - set_opt(sbi->s_mount_opt, OLDALLOC); + set_opt(sb, OLDALLOC); break; case Opt_orlov: - clear_opt(sbi->s_mount_opt, OLDALLOC); + clear_opt(sb, OLDALLOC); break; #ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt(sbi->s_mount_opt, XATTR_USER); + clear_opt(sb, XATTR_USER); break; #else case Opt_user_xattr: @@ -1481,10 +1527,10 @@ static int parse_options(char *options, struct super_block *sb, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); break; case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); + clear_opt(sb, POSIX_ACL); break; #else case Opt_acl: @@ -1503,7 +1549,7 @@ static int parse_options(char *options, struct super_block *sb, "Cannot specify journal on remount"); return 0; } - set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sb, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { @@ -1516,14 +1562,14 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_journal_async_commit: - set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + set_opt(sb, JOURNAL_ASYNC_COMMIT); + set_opt(sb, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt(sbi->s_mount_opt, NOLOAD); + set_opt(sb, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1566,15 +1612,15 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); + clear_opt(sb, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + set_opt(sb, DATA_ERR_ABORT); break; case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + clear_opt(sb, DATA_ERR_ABORT); break; #ifdef CONFIG_QUOTA case Opt_usrjquota: @@ -1614,12 +1660,12 @@ set_qf_format: break; case Opt_quota: case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, USRQUOTA); break; case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); + set_opt(sb, QUOTA); + set_opt(sb, GRPQUOTA); break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { @@ 
-1627,9 +1673,9 @@ set_qf_format: "options when quota turned on"); return 0; } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, QUOTA); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); break; #else case Opt_quota: @@ -1655,7 +1701,7 @@ set_qf_format: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_barrier: if (args[0].from) { @@ -1664,9 +1710,9 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); else - clear_opt(sbi->s_mount_opt, BARRIER); + clear_opt(sb, BARRIER); break; case Opt_ignore: break; @@ -1690,11 +1736,17 @@ set_qf_format: "Ignoring deprecated bh option"); break; case Opt_i_version: - set_opt(sbi->s_mount_opt, I_VERSION); + set_opt(sb, I_VERSION); sb->s_flags |= MS_I_VERSION; break; case Opt_nodelalloc: - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); + break; + case Opt_mblk_io_submit: + set_opt(sb, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sb, MBLK_IO_SUBMIT); break; case Opt_stripe: if (match_int(&args[0], &option)) @@ -1704,13 +1756,13 @@ set_qf_format: sbi->s_stripe = option; break; case Opt_delalloc: - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); break; case Opt_block_validity: - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); break; case Opt_noblock_validity: - clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + clear_opt(sb, BLOCK_VALIDITY); break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) @@ -1734,7 +1786,7 @@ set_qf_format: option); break; case Opt_noauto_da_alloc: - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb, NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: if (args[0].from) { @@ -1743,21 +1795,35 @@ set_qf_format: } else option = 1; /* No argument, default to 1 */ if (option) - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + clear_opt(sb, NO_AUTO_DA_ALLOC); else - set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + set_opt(sb,NO_AUTO_DA_ALLOC); break; case Opt_discard: - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); break; case Opt_nodiscard: - clear_opt(sbi->s_mount_opt, DISCARD); + clear_opt(sb, DISCARD); break; case Opt_dioread_nolock: - set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + set_opt(sb, DIOREAD_NOLOCK); break; case Opt_dioread_lock: - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); + break; + case Opt_init_inode_table: + set_opt(sb, INIT_INODE_TABLE); + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = EXT4_DEF_LI_WAIT_MULT; + if (option < 0) + return 0; + sbi->s_li_wait_mult = option; + break; + case Opt_noinit_inode_table: + clear_opt(sb, INIT_INODE_TABLE); break; default: ext4_msg(sb, KERN_ERR, @@ -1769,10 +1835,10 @@ set_qf_format: #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sb, USRQUOTA); if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); + clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " @@ -1842,12 +1908,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 
1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04x]\n", + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt); + sbi->s_mount_opt, sbi->s_mount_opt2); return res; } @@ -1877,14 +1943,13 @@ static int ext4_fill_flex_info(struct super_block *sb) size = flex_group_count * sizeof(struct flex_groups); sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vmalloc(size); - if (sbi->s_flex_groups) - memset(sbi->s_flex_groups, 0, size); - } - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for " - "%u flex groups", flex_group_count); - goto failed; + sbi->s_flex_groups = vzalloc(size); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, + "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } } for (i = 0; i < sbi->s_groups_count; i++) { @@ -1942,7 +2007,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, } /* Called at mount-time, super-block is locked */ -static int ext4_check_descriptors(struct super_block *sb) +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -1951,7 +2017,7 @@ static int ext4_check_descriptors(struct super_block *sb) ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; - ext4_group_t i; + ext4_group_t i, grp = sbi->s_groups_count; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; @@ -1967,6 +2033,10 @@ static int ext4_check_descriptors(struct super_block *sb) last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2004,6 +2074,8 @@ static int ext4_check_descriptors(struct super_block *sb) if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); @@ -2376,6 +2448,7 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) +#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ @@ -2412,6 +2485,16 @@ static struct attribute *ext4_attrs[] = { NULL, }; +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + NULL, +}; + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -2440,7 +2523,6 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } - static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -2452,6 +2534,17 @@ static 
struct kobj_type ext4_ktype = { .release = ext4_sb_release, }; +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. @@ -2542,6 +2635,368 @@ static void print_daily_error_info(unsigned long arg) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } +static void ext4_lazyinode_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +/* Find the next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group == ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + if (elr->lr_timeout == 0) { + timeout = jiffies - timeout; + if (elr->lr_sbi->s_li_wait_mult) + timeout *= elr->lr_sbi->s_li_wait_mult; + else + timeout *= 20; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. Should be called with li_list_mtx held. + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; + + if (!ext4_li_info) + return; + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(elr); + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +/* + * This is the function where the ext4lazyinit thread lives. It walks + * through the request list searching for the next scheduled filesystem. + * When such a filesystem is found, it runs the lazy initialization + * request (ext4_run_li_request) and keeps track of the time spent in + * this function. Based on that time we compute the next schedule time + * of the request. When the walk through the list is complete, it + * computes the next wake-up time and puts itself to sleep.
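+ * (The first pass of each request is timed; later passes are scheduled + * s_li_wait_mult times that interval apart, or 20 times it when the + * multiplier is unset, so requests on slow devices back off more.)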
 + */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup; + DEFINE_WAIT(wait); + + BUG_ON(NULL == eli); + + eli->li_timer.data = (unsigned long)current; + eli->li_timer.function = ext4_lazyinode_timeout; + + eli->li_task = current; + wake_up(&eli->li_wait_task); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + if (freezing(current)) + refrigerator(); + + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + eli->li_timer.expires = next_wakeup; + add_timer(&eli->li_timer); + prepare_to_wait(&eli->li_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + if (time_before(jiffies, next_wakeup)) + schedule(); + finish_wait(&eli->li_wait_daemon, &wait); + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * a new one. + */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + del_timer_sync(&ext4_li_info->li_timer); + eli->li_task = NULL; + wake_up(&eli->li_wait_task); + + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + struct task_struct *t; + + t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + ext4_clear_request_list(); + del_timer_sync(&ext4_li_info->li_timer); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + + wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL); + return 0; +} + +/* + * Check whether it makes sense to run the itable init thread or not. + * If there is at least one uninitialized inode table, return the + * corresponding group number; otherwise the loop goes through all + * groups and returns the total number of groups.
+ */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + eli->li_task = NULL; + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + init_waitqueue_head(&eli->li_wait_daemon); + init_waitqueue_head(&eli->li_wait_task); + init_timer(&eli->li_timer); + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + unsigned long rnd; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + get_random_bytes(&rnd, sizeof(rnd)); + elr->lr_next_sched = jiffies + (unsigned long)rnd % + (EXT4_DEF_LI_MAX_START_DELAY * HZ); + + return elr; +} + +static int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + if (sbi->s_li_request != NULL) + return 0; + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) { + sbi->s_li_request = NULL; + return 0; + } + + if (first_not_zeroed == ngroups) { + sbi->s_li_request = NULL; + return 0; + } + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) + return -ENOMEM; + + mutex_lock(&ext4_li_mtx); + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. 
+ */ + if (!ext4_li_info) + return; + + ext4_clear_request_list(); + + while (ext4_li_info->li_task) { + wake_up(&ext4_li_info->li_wait_daemon); + wait_event(ext4_li_info->li_wait_task, + ext4_li_info->li_task == NULL); + } +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2567,6 +3022,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2588,8 +3044,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); - unlock_kernel(); - /* Cleanup superblock name */ for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; @@ -2629,40 +3083,41 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", "2.6.38"); - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); } if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); + set_opt(sb, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(sb, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sb, ERRORS_RO); if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) - set_opt(sbi->s_mount_opt, DISCARD); + set_opt(sb, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -2671,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) - set_opt(sbi->s_mount_opt, BARRIER); + set_opt(sb, BARRIER); /* * enable delayed allocation by default @@ -2679,7 +3134,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (!IS_EXT3_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) - set_opt(sbi->s_mount_opt, DELALLOC); + set_opt(sb, DELALLOC); if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, &journal_devnum, &journal_ioprio, NULL, 0)) { @@ -2831,15 +3286,14 @@ static int 
ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - if ((ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || - (ext4_blocks_count(es) > - (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = -EFBIG; + ret = err; goto failed_mount; } @@ -2908,7 +3362,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb)) { + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } @@ -2924,6 +3378,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -2965,8 +3437,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "suppressed and not mounted read-only"); goto failed_mount_wq; } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + clear_opt(sb, DATA_FLAGS); + set_opt(sb, WRITEBACK_DATA); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -3004,9 +3476,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); break; case EXT4_MOUNT_ORDERED_DATA: @@ -3022,22 +3494,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. 
+ */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3099,18 +3568,18 @@ no_journal: (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " "requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DELALLOC); + clear_opt(sb, DELALLOC); } if (test_opt(sb, DIOREAD_NOLOCK)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - requested data journaling mode"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } if (sb->s_blocksize < PAGE_SIZE) { ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " "option - block size is too small"); - clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + clear_opt(sb, DIOREAD_NOLOCK); } } @@ -3129,6 +3598,10 @@ no_journal: goto failed_mount4; } + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount4; + sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, @@ -3166,7 +3639,6 @@ no_journal: if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ - lock_kernel(); kfree(orig_data); return 0; @@ -3184,10 +3656,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3195,6 +3663,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3213,7 +3685,6 @@ out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); - lock_kernel(); out_free_orig: kfree(orig_data); return ret; @@ -3306,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -3470,7 +3934,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; ext4_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); @@ -3524,9 +3988,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - es->s_free_inodes_count = 
cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -3706,6 +4171,22 @@ static int ext4_unfreeze(struct super_block *sb) return 0; } +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; @@ -3722,12 +4203,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif char *orig_data = kstrdup(data, GFP_KERNEL); - lock_kernel(); - /* Store the original options */ lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; @@ -3846,6 +4326,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) enable_quota = 1; } } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -3858,7 +4351,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) kfree(old_opts.s_qf_names[i]); #endif unlock_super(sb); - unlock_kernel(); if (enable_quota) dquot_resume(sb, -1); @@ -3869,6 +4361,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; @@ -3884,7 +4377,6 @@ restore_opts: } #endif unlock_super(sb); - unlock_kernel(); kfree(orig_data); return err; } @@ -4066,27 +4558,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name) + struct path *path) { int err; - struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - err = kern_path(name, LOOKUP_FOLLOW, &path); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (path.mnt->mnt_sb != sb) { - path_put(&path); + if (path->mnt->mnt_sb != sb) return -EXDEV; - } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ - if (path.dentry->d_parent != sb->s_root) + if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); @@ -4097,7 +4582,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... 
*/ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path.dentry->d_inode)) { + ext4_should_journal_data(path->dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -4105,25 +4590,19 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) { - path_put(&path); + if (err) return err; - } } - err = dquot_quota_on_path(sb, type, format_id, &path); - path_put(&path); - return err; + return dquot_quota_on(sb, type, format_id, path); } static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } @@ -4229,17 +4708,17 @@ out: #endif -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -4284,28 +4763,58 @@ static inline void unregister_as_ext3(void) { } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; -static int __init init_ext4_fs(void) +int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static int __init ext4_init_fs(void) { int err; ext4_check_flag_values(); - err = init_ext4_system_zone(); + err = ext4_init_pageio(); if (err) return err; + err = ext4_init_system_zone(); + if (err) + goto out5; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); - err = init_ext4_mballoc(); + + err = ext4_init_feat_adverts(); + + err = ext4_init_mballoc(); if (err) goto out3; - err = init_ext4_xattr(); + err = ext4_init_xattr(); if (err) goto out2; err = init_inodecache(); @@ -4316,38 +4825,46 @@ static int __init init_ext4_fs(void) err = register_filesystem(&ext4_fs_type); if (err) goto out; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); return 0; out: unregister_as_ext2(); unregister_as_ext3(); destroy_inodecache(); out1: - exit_ext4_xattr(); + ext4_exit_xattr(); out2: - exit_ext4_mballoc(); + ext4_exit_mballoc(); out3: + kfree(ext4_feat); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); out4: - 
exit_ext4_system_zone(); + ext4_exit_system_zone(); +out5: + ext4_exit_pageio(); return err; } -static void __exit exit_ext4_fs(void) +static void __exit ext4_exit_fs(void) { + ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - exit_ext4_xattr(); - exit_ext4_mballoc(); + ext4_exit_xattr(); + ext4_exit_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); - exit_ext4_system_zone(); + ext4_exit_system_zone(); + ext4_exit_pageio(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); -module_init(init_ext4_fs) -module_exit(exit_ext4_fs) +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a8cd8dff1a..fc32176eee3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -427,23 +427,23 @@ cleanup: static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - int i_error, b_error; + int ret, ret2; down_read(&EXT4_I(dentry->d_inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: up_read(&EXT4_I(dentry->d_inode)->xattr_sem); - return i_error + b_error; + return ret; } /* @@ -947,7 +947,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext4_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE @@ -1588,7 +1588,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT int __init -init_ext4_xattr(void) +ext4_init_xattr(void) { ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); if (!ext4_xattr_cache) @@ -1597,7 +1597,7 @@ init_ext4_xattr(void) void -exit_ext4_xattr(void) +ext4_exit_xattr(void) { if (ext4_xattr_cache) mb_cache_destroy(ext4_xattr_cache); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 518e96e4390..1ef16520b95 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -83,8 +83,8 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int init_ext4_xattr(void); -extern void exit_ext4_xattr(void); +extern int __init ext4_init_xattr(void); +extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; @@ -121,14 +121,14 @@ ext4_xattr_put_super(struct super_block *sb) { } -static inline int -init_ext4_xattr(void) +static __init inline int +ext4_init_xattr(void) { return 0; } static inline void -exit_ext4_xattr(void) +ext4_exit_xattr(void) { }
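
A note on the setup_new_group_blocks() changes in resize.c: the reserved GDT blocks and the new group's inode table are no longer zeroed by looping over bclean() and journaling one buffer per block. A single sb_issue_zeroout() call hands the whole range to the block layer, and only the bitmap updates still go through the journal. GFP_NOFS is required because the call sits on a filesystem-modification path and must not recurse into the filesystem under memory pressure. The shape of the call, as used in the hunks:

	/* zero the new group's inode table directly on the device */
	err = sb_issue_zeroout(sb, input->inode_table, sbi->s_itb_per_group,
			       GFP_NOFS);
	if (err)
		goto exit_bh;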
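The super.c error-reporting hunks all make the same conversion: a printk()/vprintk()/printk() triple, whose pieces other CPUs can interleave with, becomes one printk() that prints the wrapped va_list through the %pV format specifier. A minimal sketch of the idiom (the function name is hypothetical):

	static void example_warn(struct super_block *sb, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* one printk() call, so the message cannot be split in the log */
		printk(KERN_WARNING "EXT4-fs (%s): %pV\n", sb->s_id, &vaf);
		va_end(args);
	}

The messages that are still built from several pieces (e.g. the optional "block %llu: " prefix) now use KERN_CONT so the continuations are not treated as new log records.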
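The ext4_blkdev_get()/ext4_blkdev_put() hunks track a block-layer API change: exclusive access is no longer claimed with a separate bd_claim()/bd_release() pair around open_by_devnum()/blkdev_put(), but is requested at open time by passing FMODE_EXCL together with a holder pointer (the superblock here) and dropped by passing FMODE_EXCL to blkdev_put() again. That is why the explicit bd_claim() disappears from ext4_get_dev_journal(). The pattern, as used above:

	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
	if (IS_ERR(bdev))
		goto fail;
	/* ... 'sb' is now recorded as the exclusive holder ... */
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);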
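The ext4_i_callback()/call_rcu() pair in the destroy_inode path is the standard shape for RCU-delayed inode freeing: the slab object must survive a grace period, because lockless (RCU-walk) path lookup may still be reading the inode. A generic sketch of the pattern, with hypothetical names:

	static struct kmem_cache *example_inode_cachep;

	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		INIT_LIST_HEAD(&inode->i_dentry);
		kmem_cache_free(example_inode_cachep, inode);
	}

	static void example_destroy_inode(struct inode *inode)
	{
		/* defer the free until concurrent RCU-walk lookups are done */
		call_rcu(&inode->i_rcu, example_i_callback);
	}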
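generic_check_addressable() in the ext4_fill_super() hunk replaces the two open-coded tests with a library helper performing the same checks: that the block count fits in sector_t and that the byte offset of the last block is representable as a page-cache index (pgoff_t). Called as in the patch:

	err = generic_check_addressable(sb->s_blocksize_bits,
					ext4_blocks_count(es));
	if (err) {
		/* -EFBIG: too large for this kernel's sector_t or pgoff_t */
		goto failed_mount;
	}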
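Finally, ext4_quota_on() losing its kern_path()/path_put() calls reflects the new ->quota_on prototype visible in the hunk: the VFS now resolves the quota file name and hands the filesystem a ready struct path *, so the method only validates it and forwards to dquot_quota_on(). A minimal sketch (filesystem name hypothetical):

	static int example_quota_on(struct super_block *sb, int type,
				    int format_id, struct path *path)
	{
		/* the caller resolved the name; just check it is on this fs */
		if (path->mnt->mnt_sb != sb)
			return -EXDEV;
		return dquot_quota_on(sb, type, format_id, path);
	}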