diff options
Diffstat (limited to 'fs')
464 files changed, 8784 insertions, 12212 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index a9b6301a04f..90419715c7e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -136,7 +136,8 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) } /** - * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and release it + * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and + * release it * @dentry: dentry to look for fid in * * find a fid in the dentry and then clone to a new private fid diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 147ceef8e53..c783874a9ca 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -256,7 +256,7 @@ static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) vpt->muxnum--; if (!vpt->muxnum) { dprintk(DEBUG_MUX, "destroy proc %p\n", vpt); - send_sig(SIGKILL, vpt->task, 1); + kthread_stop(vpt->task); vpt->task = NULL; v9fs_mux_poll_task_num--; } @@ -438,11 +438,8 @@ static int v9fs_poll_proc(void *a) vpt = a; dprintk(DEBUG_MUX, "start %p %p\n", current, vpt); - allow_signal(SIGKILL); while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) { v9fs_poll_mux(m); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d9b561ba5e5..6ad6f192b6e 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -53,6 +53,8 @@ enum { Opt_uname, Opt_remotename, /* Options that take no arguments */ Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd, + /* Cache options */ + Opt_cache_loose, /* Error token */ Opt_err }; @@ -76,6 +78,8 @@ static match_table_t tokens = { {Opt_fd, "fd"}, {Opt_legacy, "noextend"}, {Opt_nodevmap, "nodevmap"}, + {Opt_cache_loose, "cache=loose"}, + {Opt_cache_loose, "loose"}, {Opt_err, NULL} }; @@ -106,6 +110,7 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) v9ses->debug = 0; v9ses->rfdno = ~0; v9ses->wfdno = ~0; + v9ses->cache = 0; if (!options) return; @@ -121,7 +126,6 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) "integer field, but no integer?\n"); continue; } - } switch (token) { case Opt_port: @@ -169,6 +173,9 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses) case Opt_nodevmap: v9ses->nodev = 1; break; + case Opt_cache_loose: + v9ses->cache = CACHE_LOOSE; + break; default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index c134d104cb2..820bf5ca35d 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -47,7 +47,7 @@ struct v9fs_session_info { unsigned int afid; /* authentication fid */ unsigned int rfdno; /* read file descriptor number */ unsigned int wfdno; /* write file descriptor number */ - + unsigned int cache; /* cache mode */ char *name; /* user name to mount as */ char *remotename; /* name of remote hierarchy being mounted */ @@ -73,6 +73,13 @@ enum { PROTO_FD, }; +/* possible values of ->cache */ +/* eventually support loose, tight, time, session, default always none */ +enum { + CACHE_NONE, /* default */ + CACHE_LOOSE, /* no consistency */ +}; + extern struct dentry *v9fs_debugfs_root; int v9fs_session_init(struct v9fs_session_info *, const char *, char *); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 450b0c1b385..8ada4c5c5d7 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,8 +40,10 @@ extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_cached_file_operations; extern const struct file_operations v9fs_dir_operations; extern struct dentry_operations v9fs_dentry_operations; +extern struct dentry_operations v9fs_cached_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct v9fs_qid *qid); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cc24abf232d..bed48fa9652 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -63,6 +63,8 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) int total = 0; int result = 0; + dprintk(DEBUG_VFS, "\n"); + buffer = kmap(page); do { if (count < rsize) diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 062daa6000a..ddffd8aa902 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,10 +53,31 @@ static int v9fs_dentry_delete(struct dentry *dentry) { dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + return 1; } /** + * v9fs_cached_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question + * + * Only return 1 if our inode is invalid. Only non-synthetic files + * (ones without mtime == 0) should be calling this function. + * + */ + +static int v9fs_cached_dentry_delete(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + + if(!inode) + return 1; + + return 0; +} + +/** * v9fs_dentry_release - called when dentry is going to be freed * @dentry: dentry that is being release * @@ -87,6 +108,11 @@ void v9fs_dentry_release(struct dentry *dentry) } } +struct dentry_operations v9fs_cached_dentry_operations = { + .d_delete = v9fs_cached_dentry_delete, + .d_release = v9fs_dentry_release, +}; + struct dentry_operations v9fs_dentry_operations = { .d_delete = v9fs_dentry_delete, .d_release = v9fs_dentry_release, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 9f17b0cacdd..653dfa5b253 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -79,6 +79,13 @@ int v9fs_file_open(struct inode *inode, struct file *file) vfid->filp = file; kfree(fcall); + if((vfid->qid.version) && (v9ses->cache)) { + dprintk(DEBUG_VFS, "cached"); + /* enable cached file options */ + if(file->f_op == &v9fs_file_operations) + file->f_op = &v9fs_cached_file_operations; + } + return 0; Clunk_Fid: @@ -110,7 +117,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); - invalidate_inode_pages(&inode->i_data); + invalidate_mapping_pages(&inode->i_data, 0, -1); } return res; @@ -234,10 +241,21 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(inode->i_mapping); return total; } +const struct file_operations v9fs_cached_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_mmap, +}; + const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read = v9fs_file_read, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9109ba1d696..124a085d1f2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -41,10 +41,10 @@ #include "v9fs_vfs.h" #include "fid.h" -static struct inode_operations v9fs_dir_inode_operations; -static struct inode_operations v9fs_dir_inode_operations_ext; -static struct inode_operations v9fs_file_inode_operations; -static struct inode_operations v9fs_symlink_inode_operations; +static const struct inode_operations v9fs_dir_inode_operations; +static const struct inode_operations v9fs_dir_inode_operations_ext; +static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations; /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -504,7 +504,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, goto error; } - dentry->d_op = &v9fs_dentry_operations; + if(v9ses->cache) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); if (nd && nd->flags & LOOKUP_OPEN) { @@ -585,17 +588,17 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; - goto clean_up_fids; + v9fs_fid_destroy(vfid); + goto error; } - dentry->d_op = &v9fs_dentry_operations; + if(v9ses->cache) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); return 0; -clean_up_fids: - if (vfid) - v9fs_fid_destroy(vfid); - clean_up_dfid: v9fs_fid_clunk(v9ses, dfid); @@ -629,7 +632,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); - dentry->d_op = &v9fs_dentry_operations; dirfid = v9fs_fid_lookup(dentry->d_parent); if(IS_ERR(dirfid)) @@ -700,6 +702,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, fid->qid = fcall->params.rstat.stat.qid; v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); + if((fid->qid.version)&&(v9ses->cache)) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_add(dentry, inode); kfree(fcall); @@ -1187,7 +1193,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, goto free_vfid; } - dentry->d_op = &v9fs_dentry_operations; + if(v9ses->cache) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); return 0; @@ -1306,7 +1315,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -static struct inode_operations v9fs_dir_inode_operations_ext = { +static const struct inode_operations v9fs_dir_inode_operations_ext = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, .symlink = v9fs_vfs_symlink, @@ -1321,7 +1330,7 @@ static struct inode_operations v9fs_dir_inode_operations_ext = { .setattr = v9fs_vfs_setattr, }; -static struct inode_operations v9fs_dir_inode_operations = { +static const struct inode_operations v9fs_dir_inode_operations = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, .unlink = v9fs_vfs_unlink, @@ -1333,12 +1342,12 @@ static struct inode_operations v9fs_dir_inode_operations = { .setattr = v9fs_vfs_setattr, }; -static struct inode_operations v9fs_file_inode_operations = { +static const struct inode_operations v9fs_file_inode_operations = { .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; -static struct inode_operations v9fs_symlink_inode_operations = { +static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = v9fs_vfs_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 63320d4e15d..0ec42f66545 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -45,7 +45,7 @@ #include "fid.h" static void v9fs_clear_inode(struct inode *); -static struct super_operations v9fs_super_ops; +static const struct super_operations v9fs_super_ops; /** * v9fs_clear_inode - release an inode @@ -263,7 +263,7 @@ v9fs_umount_begin(struct vfsmount *vfsmnt, int flags) v9fs_session_cancel(v9ses); } -static struct super_operations v9fs_super_ops = { +static const struct super_operations v9fs_super_ops = { .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, .show_options = v9fs_show_options, diff --git a/fs/Kconfig b/fs/Kconfig index 8cd2417a14d..3c4886b849f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -426,7 +426,6 @@ config OCFS2_FS select CONFIGFS_FS select JBD select CRC32 - select INET help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -675,12 +674,6 @@ config ZISOFS necessary to create such a filesystem. Say Y here if you want to be able to read such compressed CD-ROMs. -config ZISOFS_FS -# for fs/nls/Config.in - tristate - depends on ZISOFS - default ISO9660_FS - config UDF_FS tristate "UDF file system support" help @@ -1095,7 +1088,7 @@ config AFFS_FS config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO && NET help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/ecryptfs.txt> to learn more about @@ -1196,32 +1189,6 @@ config EFS_FS To compile the EFS file system support as a module, choose M here: the module will be called efs. -config JFFS_FS - tristate "Journalling Flash File System (JFFS) support" - depends on MTD && BLOCK && BROKEN - help - JFFS is the Journalling Flash File System developed by Axis - Communications in Sweden, aimed at providing a crash/powerdown-safe - file system for disk-less embedded devices. Further information is - available at (<http://developer.axis.com/software/jffs/>). - - NOTE: This filesystem is deprecated and is scheduled for removal in - 2.6.21. See Documentation/feature-removal-schedule.txt - -config JFFS_FS_VERBOSE - int "JFFS debugging verbosity (0 = quiet, 3 = noisy)" - depends on JFFS_FS - default "0" - help - Determines the verbosity level of the JFFS debugging messages. - -config JFFS_PROC_FS - bool "JFFS stats available in /proc filesystem" - depends on JFFS_FS && PROC_FS - help - Enabling this option will cause statistics from mounted JFFS file systems - to be made available to the user in the /proc/fs/jffs/ directory. - config JFFS2_FS tristate "Journalling Flash File System v2 (JFFS2) support" select CRC32 @@ -1871,20 +1838,14 @@ config CIFS file servers such as Windows 2000 (including Windows 2003, NT 4 and Windows XP) as well by Samba (which provides excellent CIFS server support for Linux and many other operating systems). Limited - support for Windows ME and similar servers is provided as well. - You must use the smbfs client filesystem to access older SMB servers - such as OS/2 and DOS. + support for OS/2 and Windows ME and similar servers is provided as well. The intent of the cifs module is to provide an advanced - network file system client for mounting to CIFS compliant servers, + network file system client for mounting to CIFS compliant servers, including support for dfs (hierarchical name space), secure per-user session establishment, safe distributed caching (oplock), optional - packet signing, Unicode and other internationalization improvements, - and optional Winbind (nsswitch) integration. You do not need to enable - cifs if running only a (Samba) server. It is possible to enable both - smbfs and cifs (e.g. if you are using CIFS for accessing Windows 2003 - and Samba 3 servers, and smbfs for accessing old servers). If you need - to mount to Samba or Windows from this machine, say Y. + packet signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. config CIFS_STATS bool "CIFS statistics" @@ -1977,14 +1938,13 @@ config CIFS_EXPERIMENTAL depends on CIFS && EXPERIMENTAL help Enables cifs features under testing. These features are - experimental and currently include support for writepages - (multipage writebehind performance improvements) and directory - change notification ie fcntl(F_DNOTIFY) as well as some security - improvements. Some also depend on setting at runtime the - pseudo-file /proc/fs/cifs/Experimental (which is disabled by - default). See the file fs/cifs/README for more details. - - If unsure, say N. + experimental and currently include DFS support and directory + change notification ie fcntl(F_DNOTIFY), as well as the upcall + mechanism which will be used for Kerberos session negotiation + and uid remapping. Some of these features also may depend on + setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental + (which is disabled by default). See the file fs/cifs/README + for more details. If unsure, say N. config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" diff --git a/fs/Makefile b/fs/Makefile index b9ffa63f77f..9edf4112bee 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -94,7 +94,6 @@ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ -obj-$(CONFIG_JFFS_FS) += jffs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 29217ff36d4..936f2af39c4 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -84,7 +84,7 @@ void __adfs_error(struct super_block *sb, const char *function, */ /* dir_*.c */ -extern struct inode_operations adfs_dir_inode_operations; +extern const struct inode_operations adfs_dir_inode_operations; extern const struct file_operations adfs_dir_operations; extern struct dentry_operations adfs_dentry_operations; extern struct adfs_dir_ops adfs_f_dir_ops; @@ -93,7 +93,7 @@ extern struct adfs_dir_ops adfs_fplus_dir_ops; extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); /* file.c */ -extern struct inode_operations adfs_file_inode_operations; +extern const struct inode_operations adfs_file_inode_operations; extern const struct file_operations adfs_file_operations; static inline __u32 signed_asl(__u32 val, signed int shift) diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 2b8903893d3..fc1a8dc64d7 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -295,7 +295,7 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) /* * directories can handle most operations... */ -struct inode_operations adfs_dir_inode_operations = { +const struct inode_operations adfs_dir_inode_operations = { .lookup = adfs_lookup, .setattr = adfs_notify_change, }; diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 6101ea679cb..f544a285592 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -36,6 +36,6 @@ const struct file_operations adfs_file_operations = { .sendfile = generic_file_sendfile, }; -struct inode_operations adfs_file_inode_operations = { +const struct inode_operations adfs_file_inode_operations = { .setattr = adfs_notify_change, }; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 5023351a7af..2e5f2c8371e 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -254,7 +254,7 @@ static void destroy_inodecache(void) kmem_cache_destroy(adfs_inode_cachep); } -static struct super_operations adfs_sops = { +static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .destroy_inode = adfs_destroy_inode, .write_inode = adfs_write_inode, diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 1dc8438ef38..7db2d287e9f 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -188,9 +188,9 @@ extern void affs_dir_truncate(struct inode *); /* jump tables */ -extern struct inode_operations affs_file_inode_operations; -extern struct inode_operations affs_dir_inode_operations; -extern struct inode_operations affs_symlink_inode_operations; +extern const struct inode_operations affs_file_inode_operations; +extern const struct inode_operations affs_dir_inode_operations; +extern const struct inode_operations affs_symlink_inode_operations; extern const struct file_operations affs_file_operations; extern const struct file_operations affs_file_operations_ofs; extern const struct file_operations affs_dir_operations; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index cad3ee34006..6e3f282424b 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -26,7 +26,7 @@ const struct file_operations affs_dir_operations = { /* * directories can handle most operations... */ -struct inode_operations affs_dir_inode_operations = { +const struct inode_operations affs_dir_inode_operations = { .create = affs_create, .lookup = affs_lookup, .link = affs_link, diff --git a/fs/affs/file.c b/fs/affs/file.c index 05b5e22de75..4aa8079e71b 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -38,7 +38,7 @@ const struct file_operations affs_file_operations = { .sendfile = generic_file_sendfile, }; -struct inode_operations affs_file_inode_operations = { +const struct inode_operations affs_file_inode_operations = { .truncate = affs_truncate, .setattr = affs_notify_change, }; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 44d439cb69f..fce6848a464 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -12,7 +12,7 @@ #include "affs.h" -extern struct inode_operations affs_symlink_inode_operations; +extern const struct inode_operations affs_symlink_inode_operations; extern struct timezone sys_tz; void diff --git a/fs/affs/super.c b/fs/affs/super.c index 3de93e79994..a324045d855 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -112,7 +112,7 @@ static void destroy_inodecache(void) kmem_cache_destroy(affs_inode_cachep); } -static struct super_operations affs_sops = { +static const struct super_operations affs_sops = { .alloc_inode = affs_alloc_inode, .destroy_inode = affs_destroy_inode, .read_inode = affs_read_inode, diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index f802256a593..41782539c90 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -70,7 +70,7 @@ const struct address_space_operations affs_symlink_aops = { .readpage = affs_symlink_readpage, }; -struct inode_operations affs_symlink_inode_operations = { +const struct inode_operations affs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/afs/cell.c b/fs/afs/cell.c index bfc1fd22d5b..1fc57837275 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -10,7 +10,6 @@ */ #include <linux/module.h> -#include <linux/sched.h> #include <linux/slab.h> #include <rxrpc/peer.h> #include <rxrpc/connection.h> diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4acd0413405..b6dc2ebe47a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -37,7 +36,7 @@ const struct file_operations afs_dir_file_operations = { .readdir = afs_dir_readdir, }; -struct inode_operations afs_dir_inode_operations = { +const struct inode_operations afs_dir_inode_operations = { .lookup = afs_dir_lookup, .getattr = afs_inode_getattr, #if 0 /* TODO */ diff --git a/fs/afs/file.c b/fs/afs/file.c index 2e8c42639ea..b17634541f6 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -30,7 +29,7 @@ static int afs_file_readpage(struct file *file, struct page *page); static void afs_file_invalidatepage(struct page *page, unsigned long offset); static int afs_file_releasepage(struct page *page, gfp_t gfp_flags); -struct inode_operations afs_file_inode_operations = { +const struct inode_operations afs_file_inode_operations = { .getattr = afs_inode_getattr, }; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6f37754906c..9d9bca6c28b 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -16,7 +16,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e88b3b65ae4..5151d5da2c2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -63,14 +63,14 @@ extern struct cachefs_index_def afs_cache_cell_index_def; /* * dir.c */ -extern struct inode_operations afs_dir_inode_operations; +extern const struct inode_operations afs_dir_inode_operations; extern const struct file_operations afs_dir_file_operations; /* * file.c */ extern const struct address_space_operations afs_fs_aops; -extern struct inode_operations afs_file_inode_operations; +extern const struct inode_operations afs_file_inode_operations; #ifdef AFS_CACHING_SUPPORT extern int afs_cache_get_page_cookie(struct page *page, @@ -104,7 +104,7 @@ extern struct cachefs_netfs afs_cache_netfs; /* * mntpt.c */ -extern struct inode_operations afs_mntpt_inode_operations; +extern const struct inode_operations afs_mntpt_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct afs_timer afs_mntpt_expiry_timer; extern struct afs_timer_ops afs_mntpt_expiry_timer_ops; diff --git a/fs/afs/main.c b/fs/afs/main.c index 913c689bdb3..f2704ba5385 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/completion.h> #include <rxrpc/rxrpc.h> #include <rxrpc/transport.h> diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 8f74e845082..68495f0de7b 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -36,7 +35,7 @@ const struct file_operations afs_mntpt_file_operations = { .open = afs_mntpt_open, }; -struct inode_operations afs_mntpt_inode_operations = { +const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, .follow_link = afs_mntpt_follow_link, .readlink = page_readlink, diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 86463ec9ccb..ae6b85b1e48 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -9,7 +9,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/proc_fs.h> diff --git a/fs/afs/super.c b/fs/afs/super.c index 18d9b77ba40..eb7e32349da 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -56,7 +56,7 @@ struct file_system_type afs_fs_type = { .fs_flags = FS_BINARY_MOUNTDATA, }; -static struct super_operations afs_super_ops = { +static const struct super_operations afs_super_ops = { .statfs = simple_statfs, .alloc_inode = afs_alloc_inode, .drop_inode = generic_delete_inode, @@ -132,7 +132,7 @@ static int aio_setup_ring(struct kioctx *ctx) dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); @@ -211,11 +211,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if ((unsigned long)nr_events > aio_max_nr) return ERR_PTR(-EAGAIN); - ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); + ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); - memset(ctx, 0, sizeof(*ctx)); ctx->max_reqs = nr_events; mm = ctx->mm = current->mm; atomic_inc(&mm->mm_count); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 906ba5ce226..4ef544434b5 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -142,8 +142,8 @@ struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info /* Operations structures */ -extern struct inode_operations autofs_root_inode_operations; -extern struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_root_inode_operations; +extern const struct inode_operations autofs_symlink_inode_operations; extern const struct file_operations autofs_root_operations; /* Initializing function */ diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index f968d134280..aa0b61ff827 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -52,7 +52,7 @@ out_kill_sb: static void autofs_read_inode(struct inode *inode); -static struct super_operations autofs_sops = { +static const struct super_operations autofs_sops = { .read_inode = autofs_read_inode, .statfs = simple_statfs, }; diff --git a/fs/autofs/root.c b/fs/autofs/root.c index e698c51d2b0..f2597205939 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -32,7 +32,7 @@ const struct file_operations autofs_root_operations = { .ioctl = autofs_root_ioctl, }; -struct inode_operations autofs_root_inode_operations = { +const struct inode_operations autofs_root_inode_operations = { .lookup = autofs_root_lookup, .unlink = autofs_root_unlink, .symlink = autofs_root_symlink, diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c index c74f2eb6577..7ce9cb2c9ce 100644 --- a/fs/autofs/symlink.c +++ b/fs/autofs/symlink.c @@ -20,7 +20,7 @@ static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations autofs_symlink_inode_operations = { +const struct inode_operations autofs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = autofs_follow_link }; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 216b1a364cc..6b4cec3f272 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -168,11 +168,11 @@ int autofs4_expire_multi(struct super_block *, struct vfsmount *, /* Operations structures */ -extern struct inode_operations autofs4_symlink_inode_operations; -extern struct inode_operations autofs4_dir_inode_operations; -extern struct inode_operations autofs4_root_inode_operations; -extern struct inode_operations autofs4_indirect_root_inode_operations; -extern struct inode_operations autofs4_direct_root_inode_operations; +extern const struct inode_operations autofs4_symlink_inode_operations; +extern const struct inode_operations autofs4_dir_inode_operations; +extern const struct inode_operations autofs4_root_inode_operations; +extern const struct inode_operations autofs4_indirect_root_inode_operations; +extern const struct inode_operations autofs4_direct_root_inode_operations; extern const struct file_operations autofs4_dir_operations; extern const struct file_operations autofs4_root_operations; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index e8f6c5ad3e9..5e458e096ef 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -196,7 +196,7 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static struct super_operations autofs4_sops = { +static const struct super_operations autofs4_sops = { .statfs = simple_statfs, .show_options = autofs4_show_options, }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 8d05b9f7578..47fee96c218 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -47,7 +47,7 @@ const struct file_operations autofs4_dir_operations = { .readdir = autofs4_dir_readdir, }; -struct inode_operations autofs4_indirect_root_inode_operations = { +const struct inode_operations autofs4_indirect_root_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .symlink = autofs4_dir_symlink, @@ -55,7 +55,7 @@ struct inode_operations autofs4_indirect_root_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -struct inode_operations autofs4_direct_root_inode_operations = { +const struct inode_operations autofs4_direct_root_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .mkdir = autofs4_dir_mkdir, @@ -63,7 +63,7 @@ struct inode_operations autofs4_direct_root_inode_operations = { .follow_link = autofs4_follow_link, }; -struct inode_operations autofs4_dir_inode_operations = { +const struct inode_operations autofs4_dir_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .symlink = autofs4_dir_symlink, diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 2ea2c98fd84..b4ea82934d2 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -19,7 +19,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations autofs4_symlink_inode_operations = { +const struct inode_operations autofs4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = autofs4_follow_link }; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 869f5193ecc..efeab2fab40 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -291,7 +291,7 @@ static int bad_inode_removexattr(struct dentry *dentry, const char *name) return -EIO; } -static struct inode_operations bad_inode_ops = +static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 481e59b9d91..cc6cc8ed2e3 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -68,7 +68,7 @@ static const struct file_operations befs_dir_operations = { .readdir = befs_readdir, }; -static struct inode_operations befs_dir_inode_operations = { +static const struct inode_operations befs_dir_inode_operations = { .lookup = befs_lookup, }; @@ -78,7 +78,7 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static struct inode_operations befs_symlink_inode_operations = { +static const struct inode_operations befs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = befs_follow_link, .put_link = befs_put_link, diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 31973bbbf05..130f6c66c5b 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -48,12 +48,12 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) /* file.c */ -extern struct inode_operations bfs_file_inops; +extern const struct inode_operations bfs_file_inops; extern const struct file_operations bfs_file_operations; extern const struct address_space_operations bfs_aops; /* dir.c */ -extern struct inode_operations bfs_dir_inops; +extern const struct inode_operations bfs_dir_inops; extern const struct file_operations bfs_dir_operations; #endif /* _FS_BFS_BFS_H */ diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 2a746e688df..097f1497f74 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -260,7 +260,7 @@ end_rename: return error; } -struct inode_operations bfs_dir_inops = { +const struct inode_operations bfs_dir_inops = { .create = bfs_create, .lookup = bfs_lookup, .link = bfs_link, diff --git a/fs/bfs/file.c b/fs/bfs/file.c index a9164a87f8d..ef4d1fa04e6 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -164,4 +164,4 @@ const struct address_space_operations bfs_aops = { .bmap = bfs_bmap, }; -struct inode_operations bfs_file_inops; +const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 134c99941a6..93d6219243a 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static void destroy_inodecache(void) kmem_cache_destroy(bfs_inode_cachep); } -static struct super_operations bfs_sops = { +static const struct super_operations bfs_sops = { .alloc_inode = bfs_alloc_inode, .destroy_inode = bfs_destroy_inode, .read_inode = bfs_read_inode, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 669dbe5b031..51db1182b27 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -76,7 +76,8 @@ static struct linux_binfmt elf_format = { .load_binary = load_elf_binary, .load_shlib = load_elf_library, .core_dump = elf_core_dump, - .min_coredump = ELF_EXEC_PAGESIZE + .min_coredump = ELF_EXEC_PAGESIZE, + .hasvdso = 1 }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a4d933a5120..5810aa1339f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -372,7 +372,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, down_write(¤t->mm->mmap_sem); current->mm->start_brk = do_mmap(NULL, 0, stack_size, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index ae8595d4985..7b0265d7f3a 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -419,7 +419,7 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long memp = 0; /* for finding the brk area */ + unsigned long len, reallen, memp = 0; unsigned long extra, rlim; unsigned long *reloc = 0, *rp; struct inode *inode; @@ -540,10 +540,18 @@ static int load_flat_file(struct linux_binprm * bprm, goto err; } + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); down_write(¤t->mm->mmap_sem); - realdatastart = do_mmap(0, 0, data_len + extra + - MAX_SHARED_LIBS * sizeof(unsigned long), - PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + realdatastart = do_mmap(0, 0, len, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); + /* Remap to use all availabe slack region space */ + if (realdatastart && (realdatastart < (unsigned long)-4096)) { + reallen = ksize(realdatastart); + if (reallen > len) { + realdatastart = do_mremap(realdatastart, len, + reallen, MREMAP_FIXED, realdatastart); + } + } up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { @@ -584,11 +592,20 @@ static int load_flat_file(struct linux_binprm * bprm, } else { + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); down_write(¤t->mm->mmap_sem); - textpos = do_mmap(0, 0, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(unsigned long), - PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + textpos = do_mmap(0, 0, len, + PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); + /* Remap to use all availabe slack region space */ + if (textpos && (textpos < (unsigned long) -4096)) { + reallen = ksize(textpos); + if (reallen > len) { + textpos = do_mremap(textpos, len, reallen, + MREMAP_FIXED, textpos); + } + } up_write(¤t->mm->mmap_sem); + if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) textpos = (unsigned long) -ENOMEM; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c2e08252af3..e6f57990b12 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -719,7 +719,7 @@ static const struct file_operations bm_status_operations = { /* Superblock handling */ -static struct super_operations s_ops = { +static const struct super_operations s_ops = { .statfs = simple_statfs, .clear_inode = bm_clear_inode, }; diff --git a/fs/block_dev.c b/fs/block_dev.c index fc7028b685f..0c59b703e9d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -489,7 +489,7 @@ static void bdev_clear_inode(struct inode *inode) spin_unlock(&bdev_lock); } -static struct super_operations bdev_sops = { +static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .destroy_inode = bdev_destroy_inode, diff --git a/fs/buffer.c b/fs/buffer.c index 1ad674fd348..f99c509697c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(__lock_buffer); void fastcall unlock_buffer(struct buffer_head *bh) { + smp_mb__before_clear_bit(); clear_buffer_locked(bh); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); @@ -345,7 +346,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) * We really want to use invalidate_inode_pages2() for * that, but not until that's cleaned up. */ - invalidate_inode_pages(mapping); + invalidate_mapping_pages(mapping, 0, -1); } /* @@ -1282,11 +1283,11 @@ static void bh_lru_install(struct buffer_head *bh) * Look up the bh in this cpu's LRU. If it's there, move it to the head. */ static struct buffer_head * -lookup_bh_lru(struct block_device *bdev, sector_t block, int size) +lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *ret = NULL; struct bh_lru *lru; - int i; + unsigned int i; check_irqs_on(); bh_lru_lock(); @@ -1318,7 +1319,7 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, int size) * NULL */ struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, int size) +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); @@ -1346,7 +1347,7 @@ EXPORT_SYMBOL(__find_get_block); * attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, int size) +__getblk(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __find_get_block(bdev, block, size); @@ -1360,7 +1361,7 @@ EXPORT_SYMBOL(__getblk); /* * Do async read-ahead on a buffer.. */ -void __breadahead(struct block_device *bdev, sector_t block, int size) +void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { @@ -1380,7 +1381,7 @@ EXPORT_SYMBOL(__breadahead); * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, int size) +__bread(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); @@ -1439,6 +1440,7 @@ static void discard_buffer(struct buffer_head * bh) clear_buffer_req(bh); clear_buffer_new(bh); clear_buffer_delay(bh); + clear_buffer_unwritten(bh); unlock_buffer(bh); } @@ -1822,6 +1824,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page, continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; @@ -2543,7 +2546,7 @@ int block_truncate_page(struct address_space *mapping, if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh) && !buffer_delay(bh)) { + if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = -EIO; ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); diff --git a/fs/char_dev.c b/fs/char_dev.c index a885f46ca00..e6194e2b9bb 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -108,6 +108,13 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, /* temporary */ if (major == 0) { for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { + /* + * Disallow the LANANA-assigned LOCAL/EXPERIMENTAL + * majors + */ + if ((60 <= i && i <= 63) || (120 <= i && i <= 127) || + (240 <= i && i <= 254)) + continue; if (chrdevs[i] == NULL) break; } diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index d04d2f7448d..5fe13593b57 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,6 +1,15 @@ Version 1.47 ------------ Fix oops in list_del during mount caused by unaligned string. +Fix file corruption which could occur on some large file +copies caused by writepages page i/o completion bug. +Seek to SEEK_END forces check for update of file size for non-cached +files. Allow file size to be updated on remote extend of locally open, +non-cached file. Fix reconnect to newer Samba servers (or other servers +which support the CIFS Unix/POSIX extensions) so that we again tell the +server the Unix/POSIX cifs capabilities which we support (SetFSInfo). +Add experimental support for new POSIX Open/Mkdir (which returns +stat information on the open, and allows setting the mode). Version 1.46 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 432e515431c..080c5eba112 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -1,5 +1,5 @@ The CIFS VFS support for Linux supports many advanced network filesystem -features such as heirarchical dfs like namespace, hardlinks, locking and more. +features such as hierarchical dfs like namespace, hardlinks, locking and more. It was designed to comply with the SNIA CIFS Technical Reference (which supersedes the 1992 X/Open SMB Standard) as well as to perform best practice practical interoperability with Windows 2000, Windows XP, Samba and equivalent diff --git a/fs/cifs/TODO b/fs/cifs/TODO index fc34c74ec4b..68372946dc9 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -128,3 +128,11 @@ negotiated size) and send larger write sizes to modern servers. 4) More exhaustively test against less common servers. More testing against Windows 9x, Windows ME servers. + +DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too) + +mount check for unmatched uids - and uid override + +Add mount option for Linux extension disable per mount, and partial disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?) + +Free threads at umount --force that are stuck on the sesSem diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 10c90294cd1..e8287c4c6eb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,7 +64,7 @@ extern struct task_struct * oplockThread; /* remove sparse warning */ struct task_struct * oplockThread = NULL; extern struct task_struct * dnotifyThread; /* remove sparse warning */ struct task_struct * dnotifyThread = NULL; -static struct super_operations cifs_super_ops; +static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, int, 0); MODULE_PARM_DESC(CIFSMaxBufSize,"Network buffer size (not including header). Default: 16384 Range: 8192 to 130048"); @@ -453,7 +453,7 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations cifs_super_ops = { +static const struct super_operations cifs_super_ops = { .read_inode = cifs_read_inode, .put_super = cifs_put_super, .statfs = cifs_statfs, @@ -511,7 +511,15 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) { /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { - int retval = cifs_revalidate(file->f_path.dentry); + int retval; + + /* some applications poll for the file length in this strange + way so we must seek to end on non-oplocked files by + setting the revalidate time to zero */ + if(file->f_path.dentry->d_inode) + CIFS_I(file->f_path.dentry->d_inode)->time = 0; + + retval = cifs_revalidate(file->f_path.dentry); if (retval < 0) return (loff_t)retval; } @@ -525,7 +533,7 @@ static struct file_system_type cifs_fs_type = { .kill_sb = kill_anon_super, /* .fs_flags */ }; -struct inode_operations cifs_dir_inode_ops = { +const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .lookup = cifs_lookup, .getattr = cifs_getattr, @@ -547,7 +555,7 @@ struct inode_operations cifs_dir_inode_ops = { #endif }; -struct inode_operations cifs_file_inode_ops = { +const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ @@ -561,7 +569,7 @@ struct inode_operations cifs_file_inode_ops = { #endif }; -struct inode_operations cifs_symlink_inode_ops = { +const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, .follow_link = cifs_follow_link, .put_link = cifs_put_link, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 8aa66dcf13b..c97c08eb481 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -36,13 +36,13 @@ extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; /* Functions related to super block operations */ -/* extern struct super_operations cifs_super_ops;*/ +/* extern const struct super_operations cifs_super_ops;*/ extern void cifs_read_inode(struct inode *); extern void cifs_delete_inode(struct inode *); /* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */ /* Functions related to inodes */ -extern struct inode_operations cifs_dir_inode_ops; +extern const struct inode_operations cifs_dir_inode_ops; extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry * cifs_lookup(struct inode *, struct dentry *, @@ -58,8 +58,8 @@ extern int cifs_revalidate(struct dentry *); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); -extern struct inode_operations cifs_file_inode_ops; -extern struct inode_operations cifs_symlink_inode_ops; +extern const struct inode_operations cifs_file_inode_ops; +extern const struct inode_operations cifs_symlink_inode_ops; /* Functions related to files and directories */ extern const struct file_operations cifs_file_ops; @@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.47" +#define CIFS_VERSION "1.48" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 068ef51edbf..7d9505491b1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1,7 +1,7 @@ /* * fs/cifs/cifspdu.h * - * Copyright (c) International Business Machines Corp., 2002,2005 + * Copyright (c) International Business Machines Corp., 2002,2007 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -544,7 +544,8 @@ typedef union smb_com_session_setup_andx { /* unsigned char * NativeOS; */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ - } __attribute__((packed)) resp; /* NTLM response with or without extended sec*/ + } __attribute__((packed)) resp; /* NTLM response + (with or without extended sec) */ struct { /* request format */ struct smb_hdr hdr; /* wct = 10 */ @@ -795,6 +796,8 @@ typedef struct smb_com_openx_rsp { __u16 ByteCount; } __attribute__((packed)) OPENX_RSP; +/* For encoding of POSIX Open Request - see trans2 function 0x209 data struct */ + /* Legacy write request for older servers */ typedef struct smb_com_writex_req { struct smb_hdr hdr; /* wct = 12 */ @@ -1352,11 +1355,13 @@ struct smb_t2_rsp { #define SMB_QUERY_FILE_UNIX_BASIC 0x200 #define SMB_QUERY_FILE_UNIX_LINK 0x201 #define SMB_QUERY_POSIX_ACL 0x204 -#define SMB_QUERY_XATTR 0x205 +#define SMB_QUERY_XATTR 0x205 /* e.g. system EA name space */ #define SMB_QUERY_ATTR_FLAGS 0x206 /* append,immutable etc. */ #define SMB_QUERY_POSIX_PERMISSION 0x207 #define SMB_QUERY_POSIX_LOCK 0x208 -/* #define SMB_POSIX_OPEN 0x209 */ +/* #define SMB_POSIX_OPEN 0x209 */ +/* #define SMB_POSIX_UNLINK 0x20a */ +#define SMB_QUERY_FILE__UNIX_INFO2 0x20b #define SMB_QUERY_FILE_INTERNAL_INFO 0x3ee #define SMB_QUERY_FILE_ACCESS_INFO 0x3f0 #define SMB_QUERY_FILE_NAME_INFO2 0x3f1 /* 0x30 bytes */ @@ -1377,8 +1382,10 @@ struct smb_t2_rsp { #define SMB_SET_ATTR_FLAGS 0x206 /* append, immutable etc. */ #define SMB_SET_POSIX_LOCK 0x208 #define SMB_POSIX_OPEN 0x209 +#define SMB_POSIX_UNLINK 0x20a +#define SMB_SET_FILE_UNIX_INFO2 #define SMB_SET_FILE_BASIC_INFO2 0x3ec -#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo level too */ +#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ #define SMB_FILE_ALL_INFO2 0x3fa #define SMB_SET_FILE_ALLOCATION_INFO2 0x3fb #define SMB_SET_FILE_END_OF_FILE_INFO2 0x3fc @@ -1428,7 +1435,7 @@ typedef struct smb_com_transaction2_qpi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ } __attribute__((packed)) TRANSACTION2_QPI_RSP; typedef struct smb_com_transaction2_spi_req { @@ -1461,7 +1468,7 @@ typedef struct smb_com_transaction2_spi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */ + __u16 Reserved2; /* parameter word is present for infolevels > 100 */ } __attribute__((packed)) TRANSACTION2_SPI_RSP; struct set_file_rename { @@ -1627,6 +1634,7 @@ typedef struct smb_com_transaction2_fnext_rsp_parms { #define SMB_QUERY_FS_ATTRIBUTE_INFO 0x105 #define SMB_QUERY_CIFS_UNIX_INFO 0x200 #define SMB_QUERY_POSIX_FS_INFO 0x201 +#define SMB_QUERY_POSIX_WHO_AM_I 0x202 #define SMB_QUERY_LABEL_INFO 0x3ea #define SMB_QUERY_FS_QUOTA_INFO 0x3ee #define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef @@ -1659,9 +1667,21 @@ typedef struct smb_com_transaction_qfsi_rsp { struct smb_hdr hdr; /* wct = 10 + SetupCount */ struct trans2_resp t2; __u16 ByteCount; - __u8 Pad; /* may be three bytes *//* followed by data area */ + __u8 Pad; /* may be three bytes? *//* followed by data area */ } __attribute__((packed)) TRANSACTION2_QFSI_RSP; +typedef struct whoami_rsp_data { /* Query level 0x202 */ + __u32 flags; /* 0 = Authenticated user 1 = GUEST */ + __u32 mask; /* which flags bits server understands ie 0x0001 */ + __u64 unix_user_id; + __u64 unix_user_gid; + __u32 number_of_supplementary_gids; /* may be zero */ + __u32 number_of_sids; /* may be zero */ + __u32 length_of_sid_array; /* in bytes - may be zero */ + __u32 pad; /* reserved - MBZ */ + /* __u64 gid_array[0]; */ /* may be empty */ + /* __u8 * psid_list */ /* may be empty */ +} __attribute__((packed)) WHOAMI_RSP_DATA; /* SETFSInfo Levels */ #define SMB_SET_CIFS_UNIX_INFO 0x200 @@ -1858,8 +1878,11 @@ typedef struct { #define CIFS_UNIX_XATTR_CAP 0x00000004 /* support new namespace */ #define CIFS_UNIX_EXTATTR_CAP 0x00000008 /* support chattr/chflag */ #define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Allow POSIX path chars */ +#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based + calls including posix open + and posix unlink */ #ifdef CONFIG_CIFS_POSIX -#define CIFS_UNIX_CAP_MASK 0x0000001b +#define CIFS_UNIX_CAP_MASK 0x0000003b #else #define CIFS_UNIX_CAP_MASK 0x00000013 #endif /* CONFIG_CIFS_POSIX */ @@ -1946,7 +1969,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __le32 AlignmentRequirement; __le32 FileNameLength; char FileName[1]; -} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ +} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ /* defines for enumerating possible values of the Unix type field below */ #define UNIX_FILE 0 @@ -1970,11 +1993,11 @@ typedef struct { __u64 UniqueId; __le64 Permissions; __le64 Nlinks; -} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ +} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ typedef struct { char LinkDest[1]; -} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ +} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ /* The following three structures are needed only for setting time to NT4 and some older servers via @@ -2011,7 +2034,7 @@ typedef struct { __le64 ChangeTime; __le32 Attributes; __u32 Pad; -} __attribute__((packed)) FILE_BASIC_INFO; /* size info, level 0x101 */ +} __attribute__((packed)) FILE_BASIC_INFO; /* size info, level 0x101 */ struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ @@ -2020,7 +2043,7 @@ struct file_allocation_info { struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ -} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ +} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ struct file_alt_name_info { __u8 alt_name[1]; @@ -2075,6 +2098,19 @@ struct cifs_posix_acl { /* access conrol list (ACL) */ /* end of POSIX ACL definitions */ +typedef struct { + __u32 OpenFlags; /* same as NT CreateX */ + __u32 PosixOpenFlags; + __u32 Mode; + __u16 Level; /* reply level requested (see QPathInfo levels) */ + __u16 Pad; /* reserved - MBZ */ +} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ + +typedef struct { + /* reply varies based on requested level */ +} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ + + struct file_internal_info { __u64 UniqueId; /* inode number */ } __attribute__((packed)); /* level 0x3ee */ @@ -2238,7 +2274,8 @@ struct data_blob { 1) PosixCreateX - to set and return the mode, inode#, device info and perhaps add a CreateDevice - to create Pipes and other special .inodes Also note POSIX open flags - 2) Close - to return the last write time to do cache across close more safely + 2) Close - to return the last write time to do cache across close + more safely 3) FindFirst return unique inode number - what about resume key, two forms short (matches readdir) and full (enough info to cache inodes) 4) Mkdir - set mode @@ -2273,7 +2310,8 @@ struct data_blob { TRANSACTION2 (18 cases) SMB_SET_FILE_END_OF_FILE_INFO2 SMB_SET_PATH_END_OF_FILE_INFO2 (BB verify that never need to set allocation size) - SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via Unix ext?) + SMB_SET_FILE_BASIC_INFO2 (setting times - BB can it be done via + Unix ext?) COPY (note support for copy across directories) - FUTURE, OPTIONAL setting/getting OS/2 EAs - FUTURE (BB can this handle @@ -2293,13 +2331,13 @@ struct data_blob { T2 QUERY_PATH_INFO (SMB_QUERY_FILE_UNIX_BASIC) - BB check for missing inode fields Actually need QUERY_FILE_UNIX_INFO since has inode num BB what about a) blksize/blkbits/blocks - b) i_version - c) i_rdev - d) notify mask? - e) generation - f) size_seqcount + b) i_version + c) i_rdev + d) notify mask? + e) generation + f) size_seqcount T2 FIND_FIRST/FIND_NEXT FIND_FILE_UNIX - TRANS2_GET_DFS_REFERRAL - OPTIONAL but recommended + TRANS2_GET_DFS_REFERRAL - OPTIONAL but recommended T2_QFS_INFO QueryDevice/AttributeInfo - OPTIONAL @@ -2338,7 +2376,7 @@ typedef struct file_xattr_info { __u32 xattr_value_len; char xattr_name[0]; /* followed by xattr_value[xattr_value_len], no pad */ -} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute, info level 0x205 */ +} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute, info level 0x205 */ /* flags for chattr command */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f1f8225102f..6148b82170c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -23,6 +23,7 @@ #include <linux/nls.h> struct statfs; +struct smb_vol; /* ***************************************************************** @@ -57,7 +58,7 @@ extern int SendReceiveBlockingLock(const unsigned int /* xid */ , int * /* bytes returned */); extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); -extern int is_size_safe_to_change(struct cifsInodeInfo *); +extern int is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); @@ -147,6 +148,8 @@ extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, unsigned int *pnum_referrals, unsigned char ** preferrals, int remap); +extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon, + struct super_block * sb, struct smb_vol * vol); extern int CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData); extern int SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 472e33e0f3c..24364106b8f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -158,9 +158,15 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, nls_codepage); if(!rc && (tcon->tidStatus == CifsNeedReconnect)) { mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon - , nls_codepage); + rc = CIFSTCon(0, tcon->ses, tcon->treeName, + tcon, nls_codepage); up(&tcon->ses->sesSem); + /* tell server which Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0 /* no xid */, + tcon, + NULL /* we do not know sb */, + NULL /* no vol info */); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ @@ -298,6 +304,12 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); up(&tcon->ses->sesSem); + /* tell server which Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0 /* no xid */, + tcon, + NULL /* do not know sb */, + NULL /* no vol info */); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ @@ -2812,10 +2824,10 @@ GetExtAttrOut: /* security id for everyone */ -const static struct cifs_sid sid_everyone = +static const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}}; /* group users */ -const static struct cifs_sid sid_user = +static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}}; /* Convert CIFS ACL to POSIX form */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2caca06b4ba..20ba7dcc995 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1613,6 +1613,76 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) return rc; } +void reset_cifs_unix_caps(int xid, struct cifsTconInfo * tcon, + struct super_block * sb, struct smb_vol * vol_info) +{ + /* if we are reconnecting then should we check to see if + * any requested capabilities changed locally e.g. via + * remount but we can not do much about it here + * if they have (even if we could detect it by the following) + * Perhaps we could add a backpointer to array of sb from tcon + * or if we change to make all sb to same share the same + * sb as NFS - then we only have one backpointer to sb. + * What if we wanted to mount the server share twice once with + * and once without posixacls or posix paths? */ + __u64 saved_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + + + if(!CIFSSMBQFSUnixInfo(xid, tcon)) { + __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + + /* check for reconnect case in which we do not + want to change the mount behavior if we can avoid it */ + if(vol_info == NULL) { + /* turn off POSIX ACL and PATHNAMES if not set + originally at mount time */ + if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + + + + + } + + cap &= CIFS_UNIX_CAP_MASK; + if(vol_info && vol_info->no_psx_acl) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + else if(CIFS_UNIX_POSIX_ACL_CAP & cap) { + cFYI(1,("negotiated posix acl support")); + if(sb) + sb->s_flags |= MS_POSIXACL; + } + + if(vol_info && vol_info->posix_paths == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + else if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { + cFYI(1,("negotiate posix pathnames")); + if(sb) + CIFS_SB(sb)->mnt_cifs_flags |= + CIFS_MOUNT_POSIX_PATHS; + } + + cFYI(1,("Negotiate caps 0x%x",(int)cap)); +#ifdef CONFIG_CIFS_DEBUG2 + if(cap & CIFS_UNIX_FCNTL_CAP) + cFYI(1,("FCNTL cap")); + if(cap & CIFS_UNIX_EXTATTR_CAP) + cFYI(1,("EXTATTR cap")); + if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) + cFYI(1,("POSIX path cap")); + if(cap & CIFS_UNIX_XATTR_CAP) + cFYI(1,("XATTR cap")); + if(cap & CIFS_UNIX_POSIX_ACL_CAP) + cFYI(1,("POSIX ACL cap")); +#endif /* CIFS_DEBUG2 */ + if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { + cFYI(1,("setting capabilities failed")); + } + } +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data, const char *devname) @@ -1928,20 +1998,25 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if (tcon == NULL) rc = -ENOMEM; else { - /* check for null share name ie connect to dfs root */ + /* check for null share name ie connecting to + * dfs root */ - /* BB check if this works for exactly length three strings */ + /* BB check if this works for exactly length + * three strings */ if ((strchr(volume_info.UNC + 3, '\\') == NULL) && (strchr(volume_info.UNC + 3, '/') == NULL)) { rc = connect_to_dfs_path(xid, pSesInfo, - "", cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + "", cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); kfree(volume_info.UNC); FreeXid(xid); return -ENODEV; } else { + /* BB Do we need to wrap sesSem around + * this TCon call and Unix SetFS as + * we do on SessSetup and reconnect? */ rc = CIFSTCon(xid, pSesInfo, volume_info.UNC, tcon, cifs_sb->local_nls); @@ -1962,6 +2037,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ } + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ sb->s_time_gran = 100; /* on error free sesinfo and tcon struct if needed */ @@ -2006,45 +2082,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, /* do not care if following two calls succeed - informational */ CIFSSMBQFSDeviceInfo(xid, tcon); CIFSSMBQFSAttributeInfo(xid, tcon); - - if (tcon->ses->capabilities & CAP_UNIX) { - if(!CIFSSMBQFSUnixInfo(xid, tcon)) { - __u64 cap = - le64_to_cpu(tcon->fsUnixInfo.Capability); - cap &= CIFS_UNIX_CAP_MASK; - if(volume_info.no_psx_acl) - cap &= ~CIFS_UNIX_POSIX_ACL_CAP; - else if(CIFS_UNIX_POSIX_ACL_CAP & cap) { - cFYI(1,("negotiated posix acl support")); - sb->s_flags |= MS_POSIXACL; - } - - if(volume_info.posix_paths == 0) - cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; - else if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { - cFYI(1,("negotiate posix pathnames")); - cifs_sb->mnt_cifs_flags |= - CIFS_MOUNT_POSIX_PATHS; - } - - cFYI(1,("Negotiate caps 0x%x",(int)cap)); -#ifdef CONFIG_CIFS_DEBUG2 - if(cap & CIFS_UNIX_FCNTL_CAP) - cFYI(1,("FCNTL cap")); - if(cap & CIFS_UNIX_EXTATTR_CAP) - cFYI(1,("EXTATTR cap")); - if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) - cFYI(1,("POSIX path cap")); - if(cap & CIFS_UNIX_XATTR_CAP) - cFYI(1,("XATTR cap")); - if(cap & CIFS_UNIX_POSIX_ACL_CAP) - cFYI(1,("POSIX ACL cap")); -#endif /* CIFS_DEBUG2 */ - if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { - cFYI(1,("setting capabilities failed")); - } - } - } + + /* tell server which Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(xid, tcon, sb, &volume_info); + if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) cifs_sb->wsize = min(cifs_sb->wsize, (tcon->ses->server->maxBuf - diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8a49b2e77d3..07ff9351e9e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1146,7 +1146,7 @@ static int cifs_writepages(struct address_space *mapping, pgoff_t end; pgoff_t index; int range_whole = 0; - struct kvec iov[32]; + struct kvec * iov; int len; int n_iov = 0; pgoff_t next; @@ -1171,15 +1171,21 @@ static int cifs_writepages(struct address_space *mapping, if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - if(!experimEnabled) + if(!experimEnabled) return generic_writepages(mapping, wbc); + iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL); + if(iov == NULL) + return generic_writepages(mapping, wbc); + + /* * BB: Is this meaningful for a non-block-device file system? * If it is, we should test it again after we do I/O */ if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; + kfree(iov); return 0; } @@ -1345,7 +1351,7 @@ retry: mapping->writeback_index = index; FreeXid(xid); - + kfree(iov); return rc; } @@ -1948,7 +1954,7 @@ static int cifs_readpage(struct file *file, struct page *page) refreshing the inode only on increases in the file size but this is tricky to do without racing with writebehind page caching in the current Linux kernel design */ -int is_size_safe_to_change(struct cifsInodeInfo *cifsInode) +int is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) { struct cifsFileInfo *open_file = NULL; @@ -1970,6 +1976,9 @@ int is_size_safe_to_change(struct cifsInodeInfo *cifsInode) return 1; } + if(i_size_read(&cifsInode->vfs_inode) < end_of_file) + return 1; + return 0; } else return 1; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index c4fa91b8b62..3f5bc83dc3d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -140,7 +140,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, inode->i_gid = le64_to_cpu(findData.Gid); inode->i_nlink = le64_to_cpu(findData.Nlinks); - if (is_size_safe_to_change(cifsInfo)) { + if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the client is writing to it due to potential races */ @@ -491,8 +491,8 @@ int cifs_get_inode_info(struct inode **pinode, /* BB add code here - validate if device or weird share or device type? */ } - if (is_size_safe_to_change(cifsInfo)) { - /* can not safely change the file size here if the + if (is_size_safe_to_change(cifsInfo, le64_to_cpu(pfindData->EndOfFile))) { + /* can not safely shrink the file size here if the client is writing to it due to potential races */ i_size_write(inode,le64_to_cpu(pfindData->EndOfFile)); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 99dfb5337e3..c6220bd2716 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -156,9 +156,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, tmp_inode->i_atime = cnvrtDosUnixTm( le16_to_cpu(pfindData->LastAccessDate), le16_to_cpu(pfindData->LastAccessTime)); - tmp_inode->i_ctime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); + tmp_inode->i_ctime = cnvrtDosUnixTm( + le16_to_cpu(pfindData->LastWriteDate), + le16_to_cpu(pfindData->LastWriteTime)); AdjustForTZ(cifs_sb->tcon, tmp_inode); attr = le16_to_cpu(pfindData->Attributes); allocation_size = le32_to_cpu(pfindData->AllocationSize); @@ -222,7 +222,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, atomic_set(&cifsInfo->inUse, 1); } - if (is_size_safe_to_change(cifsInfo)) { + if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the client is writing to it due to potential races */ i_size_write(tmp_inode, end_of_file); @@ -351,10 +351,10 @@ static void unix_fill_in_inode(struct inode *tmp_inode, tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); - if (is_size_safe_to_change(cifsInfo)) { + if (is_size_safe_to_change(cifsInfo, end_of_file)) { /* can not safely change the file size here if the client is writing to it due to potential races */ - i_size_write(tmp_inode,end_of_file); + i_size_write(tmp_inode, end_of_file); /* 512 bytes (2**9) is the fake blocksize that must be used */ /* for this calculation, not the real blocksize */ diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c index 7a1b2b961ec..1b1daf63f06 100644 --- a/fs/cifs/smbdes.c +++ b/fs/cifs/smbdes.c @@ -196,7 +196,7 @@ dohash(char *out, char *in, char *key, int forw) char c[28]; char d[28]; char *cd; - char ki[16][48]; + char (*ki)[48]; char *pd1; char l[32], r[32]; char *rl; @@ -206,6 +206,12 @@ dohash(char *out, char *in, char *key, int forw) if(pk1 == NULL) return; + ki = kmalloc(16*48, GFP_KERNEL); + if(ki == NULL) { + kfree(pk1); + return; + } + cd = pk1 + 56; pd1= cd + 56; rl = pd1 + 64; @@ -243,6 +249,7 @@ dohash(char *out, char *in, char *key, int forw) er = kmalloc(48+48+32+32+32, GFP_KERNEL); if(er == NULL) { kfree(pk1); + kfree(ki); return; } erk = er+48; @@ -290,6 +297,7 @@ dohash(char *out, char *in, char *key, int forw) permute(out, rl, perm6, 64); kfree(pk1); + kfree(ki); } static void diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 4c9fecbfa91..28c872747f8 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -16,7 +16,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) return memcmp(fid1, fid2, sizeof(*fid1)) == 0; } -static struct inode_operations coda_symlink_inode_operations = { +static const struct inode_operations coda_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 0c6f7f3b3dd..9ddf5ed6216 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -66,7 +66,7 @@ static struct dentry_operations coda_dentry_operations = .d_delete = coda_dentry_delete, }; -struct inode_operations coda_dir_inode_operations = +const struct inode_operations coda_dir_inode_operations = { .create = coda_create, .lookup = coda_lookup, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 01395defed8..614175a3b02 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -90,7 +90,7 @@ static int coda_remount(struct super_block *sb, int *flags, char *data) } /* exported operations */ -static struct super_operations coda_super_operations = +static const struct super_operations coda_super_operations = { .alloc_inode = coda_alloc_inode, .destroy_inode = coda_destroy_inode, @@ -271,7 +271,7 @@ int coda_setattr(struct dentry *de, struct iattr *iattr) return error; } -struct inode_operations coda_file_inode_operations = { +const struct inode_operations coda_file_inode_operations = { .permission = coda_permission, .getattr = coda_getattr, .setattr = coda_setattr, diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 214822be87b..2bf3026adc8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -30,7 +30,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data); /* exported from this file */ -struct inode_operations coda_ioctl_inode_operations = +const struct inode_operations coda_ioctl_inode_operations = { .permission = coda_ioctl_permission, .setattr = coda_setattr, diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 1c82e9a7d7c..c57a1fa7cf2 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/ctype.h> @@ -32,8 +33,6 @@ static struct ctl_table_header *fs_table_header; -#define FS_CODA 1 /* Coda file system */ - #define CODA_TIMEOUT 3 /* timeout on upcalls to become intrble */ #define CODA_HARD 5 /* mount type "hard" or "soft" */ #define CODA_VFS 6 /* vfs statistics */ @@ -84,15 +83,11 @@ static int do_reset_coda_cache_inv_stats( ctl_table * table, int write, return 0; } -static int coda_vfs_stats_get_info( char * buffer, char ** start, - off_t offset, int length) +static int proc_vfs_stats_show(struct seq_file *m, void *v) { - int len=0; - off_t begin; struct coda_vfs_stats * ps = & coda_vfs_stat; - /* this works as long as we are below 1024 characters! */ - len += sprintf( buffer, + seq_printf(m, "Coda VFS statistics\n" "===================\n\n" "File Operations:\n" @@ -132,28 +127,14 @@ static int coda_vfs_stats_get_info( char * buffer, char ** start, ps->rmdir, ps->rename, ps->permission); - - begin = offset; - *start = buffer + begin; - len -= begin; - - if ( len > length ) - len = length; - if ( len < 0 ) - len = 0; - - return len; + return 0; } -static int coda_cache_inv_stats_get_info( char * buffer, char ** start, - off_t offset, int length) +static int proc_cache_inv_stats_show(struct seq_file *m, void *v) { - int len=0; - off_t begin; struct coda_cache_inv_stats * ps = & coda_cache_inv_stat; - /* this works as long as we are below 1024 characters! */ - len += sprintf( buffer, + seq_printf(m, "Coda cache invalidation statistics\n" "==================================\n\n" "flush\t\t%9d\n" @@ -170,31 +151,87 @@ static int coda_cache_inv_stats_get_info( char * buffer, char ** start, ps->zap_vnode, ps->purge_fid, ps->replace ); - - begin = offset; - *start = buffer + begin; - len -= begin; + return 0; +} - if ( len > length ) - len = length; - if ( len < 0 ) - len = 0; +static int proc_vfs_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_vfs_stats_show, NULL); +} - return len; +static int proc_cache_inv_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cache_inv_stats_show, NULL); } +static const struct file_operations proc_vfs_stats_fops = { + .owner = THIS_MODULE, + .open = proc_vfs_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations proc_cache_inv_stats_fops = { + .owner = THIS_MODULE, + .open = proc_cache_inv_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ctl_table coda_table[] = { - {CODA_TIMEOUT, "timeout", &coda_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, - {CODA_HARD, "hard", &coda_hard, sizeof(int), 0644, NULL, &proc_dointvec}, - {CODA_VFS, "vfs_stats", NULL, 0, 0644, NULL, &do_reset_coda_vfs_stats}, - {CODA_CACHE_INV, "cache_inv_stats", NULL, 0, 0644, NULL, &do_reset_coda_cache_inv_stats}, - {CODA_FAKE_STATFS, "fake_statfs", &coda_fake_statfs, sizeof(int), 0600, NULL, &proc_dointvec}, - { 0 } + { + .ctl_name = CTL_UNNUMBERED, + .procname = "timeout", + .data = &coda_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hard", + .data = &coda_hard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "vfs_stats", + .data = NULL, + .maxlen = 0, + .mode = 0644, + .proc_handler = &do_reset_coda_vfs_stats + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "cache_inv_stats", + .data = NULL, + .maxlen = 0, + .mode = 0644, + .proc_handler = &do_reset_coda_cache_inv_stats + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "fake_statfs", + .data = &coda_fake_statfs, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &proc_dointvec + }, + {} }; static ctl_table fs_table[] = { - {FS_CODA, "coda", NULL, 0, 0555, coda_table}, - {0} + { + .ctl_name = CTL_UNNUMBERED, + .procname = "coda", + .mode = 0555, + .child = coda_table + }, + {} }; @@ -212,9 +249,6 @@ static struct proc_dir_entry* proc_fs_coda; #endif -#define coda_proc_create(name,get_info) \ - create_proc_info_entry(name, 0, proc_fs_coda, get_info) - void coda_sysctl_init(void) { reset_coda_vfs_stats(); @@ -223,15 +257,21 @@ void coda_sysctl_init(void) #ifdef CONFIG_PROC_FS proc_fs_coda = proc_mkdir("coda", proc_root_fs); if (proc_fs_coda) { + struct proc_dir_entry *pde; + proc_fs_coda->owner = THIS_MODULE; - coda_proc_create("vfs_stats", coda_vfs_stats_get_info); - coda_proc_create("cache_inv_stats", coda_cache_inv_stats_get_info); + pde = create_proc_entry("vfs_stats", 0, proc_fs_coda); + if (pde) + pde->proc_fops = &proc_vfs_stats_fops; + pde = create_proc_entry("cache_inv_stats", 0, proc_fs_coda); + if (pde) + pde->proc_fops = &proc_cache_inv_stats_fops; } #endif #ifdef CONFIG_SYSCTL if ( !fs_table_header ) - fs_table_header = register_sysctl_table(fs_table, 0); + fs_table_header = register_sysctl_table(fs_table); #endif } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index f92cd303d2c..7b48c034b31 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -75,8 +75,8 @@ extern struct super_block * configfs_sb; extern const struct file_operations configfs_dir_operations; extern const struct file_operations configfs_file_operations; extern const struct file_operations bin_fops; -extern struct inode_operations configfs_dir_inode_operations; -extern struct inode_operations configfs_symlink_inode_operations; +extern const struct inode_operations configfs_dir_inode_operations; +extern const struct inode_operations configfs_symlink_inode_operations; extern int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 1814ba44680..34750d5e4ff 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,11 +72,10 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare { struct configfs_dirent * sd; - sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL); + sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; - memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); INIT_LIST_HEAD(&sd->s_children); @@ -931,7 +930,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -struct inode_operations configfs_dir_inode_operations = { +const struct inode_operations configfs_dir_inode_operations = { .mkdir = configfs_mkdir, .rmdir = configfs_rmdir, .symlink = configfs_symlink, diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2a7cb086e80..d98be5e0132 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -162,14 +162,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size int error; if (!buffer->page) - buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); if (!buffer->page) return -ENOMEM; - if (count > PAGE_SIZE) - count = PAGE_SIZE; + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; + /* if buf is assumed to contain a string, terminate it by \0, + * so e.g. sscanf() can scan the string easily */ + buffer->page[count] = 0; return error ? -EFAULT : count; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fb18917954a..2ec9beac17c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -49,7 +49,7 @@ static struct backing_dev_info configfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -static struct inode_operations configfs_inode_operations ={ +static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ed678529ebb..6f573004cd7 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -41,7 +41,7 @@ struct super_block * configfs_sb = NULL; struct kmem_cache *configfs_dir_cachep; static int configfs_mnt_count = 0; -static struct super_operations configfs_ops = { +static const struct super_operations configfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index fb65e0800a8..22700d2857d 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -272,7 +272,7 @@ static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, } } -struct inode_operations configfs_symlink_inode_operations = { +const struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, .put_link = configfs_put_link, diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 6db03fb089d..facd0c89be8 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -27,8 +27,8 @@ #include <asm/uaccess.h> -static struct super_operations cramfs_ops; -static struct inode_operations cramfs_dir_inode_operations; +static const struct super_operations cramfs_ops; +static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; static const struct address_space_operations cramfs_aops; @@ -518,11 +518,11 @@ static const struct file_operations cramfs_directory_operations = { .readdir = cramfs_readdir, }; -static struct inode_operations cramfs_dir_inode_operations = { +static const struct inode_operations cramfs_dir_inode_operations = { .lookup = cramfs_lookup, }; -static struct super_operations cramfs_ops = { +static const struct super_operations cramfs_ops = { .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index bf3901ab174..682f928b7f4 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/namei.h> #include <linux/debugfs.h> static ssize_t default_read_file(struct file *file, char __user *buf, @@ -44,6 +45,17 @@ const struct file_operations debugfs_file_operations = { .open = default_open, }; +static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, dentry->d_inode->i_private); + return NULL; +} + +const struct inode_operations debugfs_link_operations = { + .readlink = generic_readlink, + .follow_link = debugfs_follow_link, +}; + static void debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; @@ -254,7 +266,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, blob->size); } -static struct file_operations fops_blob = { +static const struct file_operations fops_blob = { .read = read_file_blob, .open = default_open, }; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c692487346e..7b324cfebcb 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -25,11 +25,13 @@ #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> +#include <linux/string.h> #define DEBUGFS_MAGIC 0x64626720 /* declared over in file.c */ extern struct file_operations debugfs_file_operations; +extern struct inode_operations debugfs_link_operations; static struct vfsmount *debugfs_mount; static int debugfs_mount_count; @@ -51,6 +53,9 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d case S_IFREG: inode->i_fop = &debugfs_file_operations; break; + case S_IFLNK: + inode->i_op = &debugfs_link_operations; + break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -96,6 +101,12 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return res; } +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) +{ + mode = (mode & S_IALLUGO) | S_IFLNK; + return debugfs_mknod(dir, dentry, mode, 0); +} + static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) { int res; @@ -158,10 +169,17 @@ static int debugfs_create_by_name(const char *name, mode_t mode, mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) { - if ((mode & S_IFMT) == S_IFDIR) + switch (mode & S_IFMT) { + case S_IFDIR: error = debugfs_mkdir(parent->d_inode, *dentry, mode); - else + break; + case S_IFLNK: + error = debugfs_link(parent->d_inode, *dentry, mode); + break; + default: error = debugfs_create(parent->d_inode, *dentry, mode); + break; + } dput(*dentry); } else error = PTR_ERR(*dentry); @@ -194,9 +212,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * you are responsible here.) If an error occurs, %NULL will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * returned. */ struct dentry *debugfs_create_file(const char *name, mode_t mode, struct dentry *parent, void *data, @@ -246,9 +262,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * you are responsible here.) If an error occurs, %NULL will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * returned. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { @@ -259,6 +273,47 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) EXPORT_SYMBOL_GPL(debugfs_create_dir); /** + * debugfs_create_symlink- create a symbolic link in the debugfs filesystem + * @name: a pointer to a string containing the name of the symbolic link to + * create. + * @parent: a pointer to the parent dentry for this symbolic link. This + * should be a directory dentry if set. If this paramater is NULL, + * then the symbolic link will be created in the root of the debugfs + * filesystem. + * @target: a pointer to a string containing the path to the target of the + * symbolic link. + * + * This function creates a symbolic link with the given name in debugfs that + * links to the given target path. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the symbolic + * link is to be removed (no automatic cleanup happens if your module is + * unloaded, you are responsible here.) If an error occurs, %NULL will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, + const char *target) +{ + struct dentry *result; + char *link; + + link = kstrdup(target, GFP_KERNEL); + if (!link) + return NULL; + + result = debugfs_create_file(name, S_IFLNK | S_IRWXUGO, parent, link, + NULL); + if (!result) + kfree(link); + return result; +} +EXPORT_SYMBOL_GPL(debugfs_create_symlink); + +/** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be * removed. @@ -287,15 +342,22 @@ void debugfs_remove(struct dentry *dentry) if (debugfs_positive(dentry)) { if (dentry->d_inode) { dget(dentry); - if (S_ISDIR(dentry->d_inode->i_mode)) { + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: ret = simple_rmdir(parent->d_inode, dentry); if (ret) printk(KERN_ERR "DebugFS rmdir on %s failed : " "directory not empty.\n", dentry->d_name.name); - } else + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: simple_unlink(parent->d_inode, dentry); + break; + } if (!ret) d_delete(dentry); dput(dentry); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 5f7b5a6025b..643e57b622b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -91,7 +91,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations devpts_sops = { +static const struct super_operations devpts_sops = { .statfs = simple_statfs, .remount_fs = devpts_remount, }; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index b5654a284fe..6fa7b0d5c04 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,21 +3,21 @@ menu "Distributed Lock Manager" config DLM tristate "Distributed Lock Manager (DLM)" - depends on IPV6 || IPV6=n + depends on SYSFS && (IPV6 || IPV6=n) select CONFIGFS_FS select IP_SCTP if DLM_SCTP help - A general purpose distributed lock manager for kernel or userspace - applications. + A general purpose distributed lock manager for kernel or userspace + applications. choice prompt "Select DLM communications protocol" depends on DLM default DLM_TCP help - The DLM Can use TCP or SCTP for it's network communications. - SCTP supports multi-homed operations whereas TCP doesn't. - However, SCTP seems to have stability problems at the moment. + The DLM Can use TCP or SCTP for it's network communications. + SCTP supports multi-homed operations whereas TCP doesn't. + However, SCTP seems to have stability problems at the moment. config DLM_TCP bool "TCP/IP" @@ -31,8 +31,8 @@ config DLM_DEBUG bool "DLM debugging" depends on DLM help - Under the debugfs mount point, the name of each lockspace will - appear as a file in the "dlm" directory. The output is the - list of resource and locks the local node knows about. + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. endmenu diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 88553054bbf..8665c88e5af 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -54,6 +54,11 @@ static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len); static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf); static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, @@ -73,6 +78,101 @@ static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); static ssize_t node_weight_read(struct node *nd, char *buf); static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); +struct cluster { + struct config_group group; + unsigned int cl_tcp_port; + unsigned int cl_buffer_size; + unsigned int cl_rsbtbl_size; + unsigned int cl_lkbtbl_size; + unsigned int cl_dirtbl_size; + unsigned int cl_recover_timer; + unsigned int cl_toss_secs; + unsigned int cl_scan_secs; + unsigned int cl_log_debug; +}; + +enum { + CLUSTER_ATTR_TCP_PORT = 0, + CLUSTER_ATTR_BUFFER_SIZE, + CLUSTER_ATTR_RSBTBL_SIZE, + CLUSTER_ATTR_LKBTBL_SIZE, + CLUSTER_ATTR_DIRTBL_SIZE, + CLUSTER_ATTR_RECOVER_TIMER, + CLUSTER_ATTR_TOSS_SECS, + CLUSTER_ATTR_SCAN_SECS, + CLUSTER_ATTR_LOG_DEBUG, +}; + +struct cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct cluster *, char *); + ssize_t (*store)(struct cluster *, const char *, size_t); +}; + +static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, + unsigned int *info_field, int check_zero, + const char *buf, size_t len) +{ + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + *cl_field = x; + *info_field = x; + + return len; +} + +#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \ + .attr = { .ca_name = __stringify(_name), \ + .ca_mode = _mode, \ + .ca_owner = THIS_MODULE }, \ + .show = _read, \ + .store = _write, \ +} + +#define CLUSTER_ATTR(name, check_zero) \ +static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ +{ \ + return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ + check_zero, buf, len); \ +} \ +static ssize_t name##_read(struct cluster *cl, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ +} \ +static struct cluster_attribute cluster_attr_##name = \ +__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) + +CLUSTER_ATTR(tcp_port, 1); +CLUSTER_ATTR(buffer_size, 1); +CLUSTER_ATTR(rsbtbl_size, 1); +CLUSTER_ATTR(lkbtbl_size, 1); +CLUSTER_ATTR(dirtbl_size, 1); +CLUSTER_ATTR(recover_timer, 1); +CLUSTER_ATTR(toss_secs, 1); +CLUSTER_ATTR(scan_secs, 1); +CLUSTER_ATTR(log_debug, 0); + +static struct configfs_attribute *cluster_attrs[] = { + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, + [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, + [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, + NULL, +}; + enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, @@ -152,10 +252,6 @@ struct clusters { struct configfs_subsystem subsys; }; -struct cluster { - struct config_group group; -}; - struct spaces { struct config_group ss_group; }; @@ -197,6 +293,8 @@ static struct configfs_group_operations clusters_ops = { static struct configfs_item_operations cluster_ops = { .release = release_cluster, + .show_attribute = show_cluster, + .store_attribute = store_cluster, }; static struct configfs_group_operations spaces_ops = { @@ -237,6 +335,7 @@ static struct config_item_type clusters_type = { static struct config_item_type cluster_type = { .ct_item_ops = &cluster_ops, + .ct_attrs = cluster_attrs, .ct_owner = THIS_MODULE, }; @@ -317,6 +416,16 @@ static struct config_group *make_cluster(struct config_group *g, cl->group.default_groups[1] = &cms->cs_group; cl->group.default_groups[2] = NULL; + cl->cl_tcp_port = dlm_config.ci_tcp_port; + cl->cl_buffer_size = dlm_config.ci_buffer_size; + cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; + cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; + cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; + cl->cl_recover_timer = dlm_config.ci_recover_timer; + cl->cl_toss_secs = dlm_config.ci_toss_secs; + cl->cl_scan_secs = dlm_config.ci_scan_secs; + cl->cl_log_debug = dlm_config.ci_log_debug; + space_list = &sps->ss_group; comm_list = &cms->cs_group; return &cl->group; @@ -509,6 +618,25 @@ void dlm_config_exit(void) * Functions for user space to read/write attributes */ +static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct cluster *cl = to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->show ? cla->show(cl, buf) : 0; +} + +static ssize_t store_cluster(struct config_item *i, + struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct cluster *cl = to_cluster(i); + struct cluster_attribute *cla = + container_of(a, struct cluster_attribute, attr); + return cla->store ? cla->store(cl, buf, len) : -EINVAL; +} + static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf) { @@ -775,15 +903,17 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 +#define DEFAULT_LOG_DEBUG 0 struct dlm_config_info dlm_config = { - .tcp_port = DEFAULT_TCP_PORT, - .buffer_size = DEFAULT_BUFFER_SIZE, - .rsbtbl_size = DEFAULT_RSBTBL_SIZE, - .lkbtbl_size = DEFAULT_LKBTBL_SIZE, - .dirtbl_size = DEFAULT_DIRTBL_SIZE, - .recover_timer = DEFAULT_RECOVER_TIMER, - .toss_secs = DEFAULT_TOSS_SECS, - .scan_secs = DEFAULT_SCAN_SECS + .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, + .ci_recover_timer = DEFAULT_RECOVER_TIMER, + .ci_toss_secs = DEFAULT_TOSS_SECS, + .ci_scan_secs = DEFAULT_SCAN_SECS, + .ci_log_debug = DEFAULT_LOG_DEBUG }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 9da7839958a..1e978611a96 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -17,14 +17,15 @@ #define DLM_MAX_ADDR_COUNT 3 struct dlm_config_info { - int tcp_port; - int buffer_size; - int rsbtbl_size; - int lkbtbl_size; - int dirtbl_size; - int recover_timer; - int toss_secs; - int scan_secs; + int ci_tcp_port; + int ci_buffer_size; + int ci_rsbtbl_size; + int ci_lkbtbl_size; + int ci_dirtbl_size; + int ci_recover_timer; + int ci_toss_secs; + int ci_scan_secs; + int ci_log_debug; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index ca94a837a5b..61ba670b9e0 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -287,7 +287,7 @@ static int rsb_open(struct inode *inode, struct file *file) return 0; } -static struct file_operations rsb_fops = { +static const struct file_operations rsb_fops = { .owner = THIS_MODULE, .open = rsb_open, .read = seq_read, @@ -331,7 +331,7 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf, return rv; } -static struct file_operations waiters_fops = { +static const struct file_operations waiters_fops = { .owner = THIS_MODULE, .open = waiters_open, .read = waiters_read diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 1ee8195e6fc..61d93201e1b 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -41,6 +41,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> +#include "config.h" #define DLM_LOCKSPACE_LEN 64 @@ -69,12 +70,12 @@ struct dlm_mhandle; #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) -#define DLM_LOG_DEBUG -#ifdef DLM_LOG_DEBUG -#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args) -#else -#define log_debug(ls, fmt, args...) -#endif +#define log_debug(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) #define DLM_ASSERT(x, do) \ { \ @@ -309,8 +310,8 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ -#define DLM_HEADER_MAJOR 0x00020000 -#define DLM_HEADER_MINOR 0x00000001 +#define DLM_HEADER_MAJOR 0x00030000 +#define DLM_HEADER_MINOR 0x00000000 #define DLM_MSG 1 #define DLM_RCOM 2 @@ -386,6 +387,8 @@ struct dlm_rcom { uint32_t rc_type; /* DLM_RCOM_ */ int rc_result; /* multi-purpose */ uint64_t rc_id; /* match reply with request */ + uint64_t rc_seq; /* sender's ls_recover_seq */ + uint64_t rc_seq_reply; /* remote ls_recover_seq */ char rc_buf[0]; }; @@ -523,6 +526,7 @@ struct dlm_user_proc { spinlock_t asts_spin; struct list_head locks; spinlock_t locks_spin; + struct list_head unlocking; wait_queue_head_t wait; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 30878defaeb..e725005fafd 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -754,6 +754,11 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype) mutex_unlock(&ls->ls_waiters_mutex); } +/* We clear the RESEND flag because we might be taking an lkb off the waiters + list as part of process_requestqueue (e.g. a lookup that has an optimized + request reply on the requestqueue) between dlm_recover_waiters_pre() which + set RESEND and dlm_recover_waiters_post() */ + static int _remove_from_waiters(struct dlm_lkb *lkb) { int error = 0; @@ -764,6 +769,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb) goto out; } lkb->lkb_wait_type = 0; + lkb->lkb_flags &= ~DLM_IFL_RESEND; list_del(&lkb->lkb_wait_reply); unhold_lkb(lkb); out: @@ -810,7 +816,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, res_hashchain) { if (!time_after_eq(jiffies, r->res_toss_time + - dlm_config.toss_secs * HZ)) + dlm_config.ci_toss_secs * HZ)) continue; found = 1; break; @@ -2144,12 +2150,24 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, if (lkb->lkb_astaddr) ms->m_asts |= AST_COMP; - if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP) - memcpy(ms->m_extra, r->res_name, r->res_length); + /* compare with switch in create_message; send_remove() doesn't + use send_args() */ - else if (lkb->lkb_lvbptr) + switch (ms->m_type) { + case DLM_MSG_REQUEST: + case DLM_MSG_LOOKUP: + memcpy(ms->m_extra, r->res_name, r->res_length); + break; + case DLM_MSG_CONVERT: + case DLM_MSG_UNLOCK: + case DLM_MSG_REQUEST_REPLY: + case DLM_MSG_CONVERT_REPLY: + case DLM_MSG_GRANT: + if (!lkb->lkb_lvbptr) + break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); - + break; + } } static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) @@ -2418,8 +2436,12 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); - if (receive_lvb(ls, lkb, ms)) - return -ENOMEM; + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { + /* lkb was just created so there won't be an lvb yet */ + lkb->lkb_lvbptr = allocate_lvb(ls); + if (!lkb->lkb_lvbptr) + return -ENOMEM; + } return 0; } @@ -3002,7 +3024,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) { struct dlm_message *ms = (struct dlm_message *) hd; struct dlm_ls *ls; - int error; + int error = 0; if (!recovery) dlm_message_in(ms); @@ -3119,7 +3141,7 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) out: dlm_put_lockspace(ls); dlm_astd_wake(); - return 0; + return error; } @@ -3132,6 +3154,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) if (middle_conversion(lkb)) { hold_lkb(lkb); ls->ls_stub_ms.m_result = -EINPROGRESS; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_convert_reply(lkb, &ls->ls_stub_ms); @@ -3205,6 +3228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); ls->ls_stub_ms.m_result = -DLM_EUNLOCK; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_unlock_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); @@ -3213,6 +3237,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); ls->ls_stub_ms.m_result = -DLM_ECANCEL; + ls->ls_stub_ms.m_flags = lkb->lkb_flags; _remove_from_waiters(lkb); _receive_cancel_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); @@ -3571,6 +3596,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) lock_rsb(r); switch (error) { + case -EBADR: + /* There's a chance the new master received our lock before + dlm_recover_master_reply(), this wouldn't happen if we did + a barrier between recover_masters and recover_locks. */ + log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, + (unsigned long)r, r->res_name); + dlm_send_rcom_lock(r, lkb); + goto out; case -EEXIST: log_debug(ls, "master copy exists %x", lkb->lkb_id); /* fall through */ @@ -3585,7 +3618,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) /* an ack for dlm_recover_locks() which waits for replies from all the locks it sends to new masters */ dlm_recovered_lock(r); - + out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3610,7 +3643,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } if (flags & DLM_LKF_VALBLK) { - ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); if (!ua->lksb.sb_lvbptr) { kfree(ua); __put_lkb(ls, lkb); @@ -3679,7 +3712,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua = (struct dlm_user_args *)lkb->lkb_astparam; if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { - ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); if (!ua->lksb.sb_lvbptr) { error = -ENOMEM; goto out_put; @@ -3745,12 +3778,10 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, goto out_put; spin_lock(&ua->proc->locks_spin); - list_del_init(&lkb->lkb_ownqueue); + /* dlm_user_add_ast() may have already taken lkb off the proc list */ + if (!list_empty(&lkb->lkb_ownqueue)) + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); - - /* this removes the reference for the proc->locks list added by - dlm_user_request */ - unhold_lkb(lkb); out_put: dlm_put_lkb(lkb); out: @@ -3790,9 +3821,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, /* this lkb was removed from the WAITING queue */ if (lkb->lkb_grmode == DLM_LOCK_IV) { spin_lock(&ua->proc->locks_spin); - list_del_init(&lkb->lkb_ownqueue); + list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); - unhold_lkb(lkb); } out_put: dlm_put_lkb(lkb); @@ -3853,11 +3883,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) mutex_lock(&ls->ls_clear_proc_locks); list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) { - if (lkb->lkb_ast_type) { - list_del(&lkb->lkb_astqueue); - unhold_lkb(lkb); - } - list_del_init(&lkb->lkb_ownqueue); if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) { @@ -3874,6 +3899,20 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } + + /* in-progress unlocks */ + list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags |= DLM_IFL_DEAD; + dlm_put_lkb(lkb); + } + + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_del(&lkb->lkb_astqueue); + dlm_put_lkb(lkb); + } + mutex_unlock(&ls->ls_clear_proc_locks); unlock_recovery(ls); } + diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 59012b089e8..f40817b53c6 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -236,7 +236,7 @@ static int dlm_scand(void *data) while (!kthread_should_stop()) { list_for_each_entry(ls, &lslist, ls_list) dlm_scan_rsbs(ls); - schedule_timeout_interruptible(dlm_config.scan_secs * HZ); + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } @@ -422,7 +422,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_count = 0; ls->ls_flags = 0; - size = dlm_config.rsbtbl_size; + size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); @@ -434,7 +434,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, rwlock_init(&ls->ls_rsbtbl[i].lock); } - size = dlm_config.lkbtbl_size; + size = dlm_config.ci_lkbtbl_size; ls->ls_lkbtbl_size = size; ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); @@ -446,7 +446,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_lkbtbl[i].counter = 1; } - size = dlm_config.dirtbl_size; + size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); @@ -489,7 +489,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (!ls->ls_recover_buf) goto out_dirfree; diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c index fe158d7a928..dc83a9d979b 100644 --- a/fs/dlm/lowcomms-sctp.c +++ b/fs/dlm/lowcomms-sctp.c @@ -72,6 +72,8 @@ struct nodeinfo { struct list_head writequeue; /* outgoing writequeue_entries */ spinlock_t writequeue_lock; int nodeid; + struct work_struct swork; /* Send workqueue */ + struct work_struct lwork; /* Locking workqueue */ }; static DEFINE_IDR(nodeinfo_idr); @@ -96,6 +98,7 @@ struct connection { atomic_t waiting_requests; struct cbuf cb; int eagain_flag; + struct work_struct work; /* Send workqueue */ }; /* An entry waiting to be sent */ @@ -137,19 +140,23 @@ static void cbuf_eat(struct cbuf *cb, int n) static LIST_HEAD(write_nodes); static DEFINE_SPINLOCK(write_nodes_lock); + /* Maximum number of incoming messages to process before * doing a schedule() */ #define MAX_RX_MSG_COUNT 25 -/* Manage daemons */ -static struct task_struct *recv_task; -static struct task_struct *send_task; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_wait); +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; +static struct workqueue_struct *lock_workqueue; /* The SCTP connection */ static struct connection sctp_con; +static void process_send_sockets(struct work_struct *work); +static void process_recv_sockets(struct work_struct *work); +static void process_lock_request(struct work_struct *work); static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) { @@ -222,6 +229,8 @@ static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc) spin_lock_init(&ni->lock); INIT_LIST_HEAD(&ni->writequeue); spin_lock_init(&ni->writequeue_lock); + INIT_WORK(&ni->lwork, process_lock_request); + INIT_WORK(&ni->swork, process_send_sockets); ni->nodeid = nodeid; if (nodeid > max_nodeid) @@ -249,11 +258,8 @@ static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc) /* Data or notification available on socket */ static void lowcomms_data_ready(struct sock *sk, int count_unused) { - atomic_inc(&sctp_con.waiting_requests); if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) - return; - - wake_up_interruptible(&lowcomms_recv_wait); + queue_work(recv_workqueue, &sctp_con.work); } @@ -361,10 +367,10 @@ static void init_failed(void) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } } } - wake_up_process(send_task); } /* Something happened to an association */ @@ -446,8 +452,8 @@ static void process_sctp_notification(struct msghdr *msg, char *buf) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } - wake_up_process(send_task); } break; @@ -580,8 +586,8 @@ static int receive_from_sock(void) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); + queue_work(send_workqueue, &ni->swork); } - wake_up_process(send_task); } } @@ -590,6 +596,7 @@ static int receive_from_sock(void) return 0; cbuf_add(&sctp_con.cb, ret); + // PJC: TODO: Add to node's workqueue....can we ?? ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), page_address(sctp_con.rx_page), sctp_con.cb.base, sctp_con.cb.len, @@ -635,7 +642,7 @@ static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) if (result < 0) log_print("Can't bind to port %d addr number %d", - dlm_config.tcp_port, num); + dlm_config.ci_tcp_port, num); return result; } @@ -711,7 +718,7 @@ static int init_sock(void) /* Bind to all interfaces. */ for (i = 0; i < dlm_local_count; i++) { memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); - make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); result = add_bind_addr(&localaddr, addr_len, num); if (result) @@ -820,7 +827,8 @@ void dlm_lowcomms_commit_buffer(void *arg) spin_lock_bh(&write_nodes_lock); list_add_tail(&ni->write_list, &write_nodes); spin_unlock_bh(&write_nodes_lock); - wake_up_process(send_task); + + queue_work(send_workqueue, &ni->swork); } return; @@ -863,7 +871,7 @@ static void initiate_association(int nodeid) return; } - make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); + make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); outmessage.msg_name = &rem_addr; outmessage.msg_namelen = addrlen; @@ -1088,101 +1096,75 @@ int dlm_lowcomms_close(int nodeid) return 0; } -static int write_list_empty(void) +// PJC: The work queue function for receiving. +static void process_recv_sockets(struct work_struct *work) { - int status; - - spin_lock_bh(&write_nodes_lock); - status = list_empty(&write_nodes); - spin_unlock_bh(&write_nodes_lock); - - return status; -} - -static int dlm_recvd(void *data) -{ - DECLARE_WAITQUEUE(wait, current); - - while (!kthread_should_stop()) { + if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { + int ret; int count = 0; - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&lowcomms_recv_wait, &wait); - if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) - cond_resched(); - remove_wait_queue(&lowcomms_recv_wait, &wait); - set_current_state(TASK_RUNNING); - - if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { - int ret; - - do { - ret = receive_from_sock(); + do { + ret = receive_from_sock(); - /* Don't starve out everyone else */ - if (++count >= MAX_RX_MSG_COUNT) { - cond_resched(); - count = 0; - } - } while (!kthread_should_stop() && ret >=0); - } - cond_resched(); + /* Don't starve out everyone else */ + if (++count >= MAX_RX_MSG_COUNT) { + cond_resched(); + count = 0; + } + } while (!kthread_should_stop() && ret >=0); } - - return 0; + cond_resched(); } -static int dlm_sendd(void *data) +// PJC: the work queue function for sending +static void process_send_sockets(struct work_struct *work) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (write_list_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - if (sctp_con.eagain_flag) { - sctp_con.eagain_flag = 0; - refill_write_queue(); - } - process_output_queue(); + if (sctp_con.eagain_flag) { + sctp_con.eagain_flag = 0; + refill_write_queue(); } + process_output_queue(); +} - remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); - - return 0; +// PJC: Process lock requests from a particular node. +// TODO: can we optimise this out on UP ?? +static void process_lock_request(struct work_struct *work) +{ } static void daemons_stop(void) { - kthread_stop(recv_task); - kthread_stop(send_task); + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); + destroy_workqueue(lock_workqueue); } static int daemons_start(void) { - struct task_struct *p; int error; + recv_workqueue = create_workqueue("dlm_recv"); + error = IS_ERR(recv_workqueue); + if (error) { + log_print("can't start dlm_recv %d", error); + return error; + } - p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); - error = IS_ERR(p); + send_workqueue = create_singlethread_workqueue("dlm_send"); + error = IS_ERR(send_workqueue); if (error) { - log_print("can't start dlm_recvd %d", error); + log_print("can't start dlm_send %d", error); + destroy_workqueue(recv_workqueue); return error; } - recv_task = p; - p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); - error = IS_ERR(p); + lock_workqueue = create_workqueue("dlm_rlock"); + error = IS_ERR(lock_workqueue); if (error) { - log_print("can't start dlm_sendd %d", error); - kthread_stop(recv_task); + log_print("can't start dlm_rlock %d", error); + destroy_workqueue(send_workqueue); + destroy_workqueue(recv_workqueue); return error; } - send_task = p; return 0; } @@ -1194,6 +1176,8 @@ int dlm_lowcomms_start(void) { int error; + INIT_WORK(&sctp_con.work, process_recv_sockets); + error = init_sock(); if (error) goto fail_sock; @@ -1224,4 +1208,3 @@ void dlm_lowcomms_stop(void) for (i = 0; i < dlm_local_count; i++) kfree(dlm_local_addr[i]); } - diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c index 9be3a440c42..07e0a122c32 100644 --- a/fs/dlm/lowcomms-tcp.c +++ b/fs/dlm/lowcomms-tcp.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -96,10 +96,7 @@ static bool cbuf_empty(struct cbuf *cb) struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct rw_semaphore sock_sem; /* Stop connect races */ - struct list_head read_list; /* On this list when ready for reading */ - struct list_head write_list; /* On this list when ready for writing */ - struct list_head state_list; /* On this list when ready to connect */ + struct mutex sock_mutex; unsigned long flags; /* bit 1,2 = We are on the read/write lists */ #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 @@ -112,9 +109,10 @@ struct connection { struct page *rx_page; struct cbuf cb; int retries; - atomic_t waiting_requests; #define MAX_CONNECT_RETRIES 3 struct connection *othercon; + struct work_struct rwork; /* Receive workqueue */ + struct work_struct swork; /* Send workqueue */ }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -131,14 +129,9 @@ struct writequeue_entry { static struct sockaddr_storage dlm_local_addr; -/* Manage daemons */ -static struct task_struct *recv_task; -static struct task_struct *send_task; - -static wait_queue_t lowcomms_send_waitq_head; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_send_waitq); -static wait_queue_t lowcomms_recv_waitq_head; -static DECLARE_WAIT_QUEUE_HEAD(lowcomms_recv_waitq); +/* Work queues */ +static struct workqueue_struct *recv_workqueue; +static struct workqueue_struct *send_workqueue; /* An array of pointers to connections, indexed by NODEID */ static struct connection **connections; @@ -146,17 +139,8 @@ static DECLARE_MUTEX(connections_lock); static struct kmem_cache *con_cache; static int conn_array_size; -/* List of sockets that have reads pending */ -static LIST_HEAD(read_sockets); -static DEFINE_SPINLOCK(read_sockets_lock); - -/* List of sockets which have writes pending */ -static LIST_HEAD(write_sockets); -static DEFINE_SPINLOCK(write_sockets_lock); - -/* List of sockets which have connects pending */ -static LIST_HEAD(state_sockets); -static DEFINE_SPINLOCK(state_sockets_lock); +static void process_recv_sockets(struct work_struct *work); +static void process_send_sockets(struct work_struct *work); static struct connection *nodeid2con(int nodeid, gfp_t allocation) { @@ -186,9 +170,11 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) goto finish; con->nodeid = nodeid; - init_rwsem(&con->sock_sem); + mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); connections[nodeid] = con; } @@ -203,41 +189,22 @@ static void lowcomms_data_ready(struct sock *sk, int count_unused) { struct connection *con = sock2con(sk); - atomic_inc(&con->waiting_requests); - if (test_and_set_bit(CF_READ_PENDING, &con->flags)) - return; - - spin_lock_bh(&read_sockets_lock); - list_add_tail(&con->read_list, &read_sockets); - spin_unlock_bh(&read_sockets_lock); - - wake_up_interruptible(&lowcomms_recv_waitq); + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); } static void lowcomms_write_space(struct sock *sk) { struct connection *con = sock2con(sk); - if (test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - return; - - spin_lock_bh(&write_sockets_lock); - list_add_tail(&con->write_list, &write_sockets); - spin_unlock_bh(&write_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); } static inline void lowcomms_connect_sock(struct connection *con) { - if (test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) - return; - - spin_lock_bh(&state_sockets_lock); - list_add_tail(&con->state_list, &state_sockets); - spin_unlock_bh(&state_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); } static void lowcomms_state_change(struct sock *sk) @@ -279,7 +246,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, /* Close a remote connection and tidy up */ static void close_connection(struct connection *con, bool and_other) { - down_write(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->sock) { sock_release(con->sock); @@ -294,24 +261,27 @@ static void close_connection(struct connection *con, bool and_other) con->rx_page = NULL; } con->retries = 0; - up_write(&con->sock_sem); + mutex_unlock(&con->sock_mutex); } /* Data received from remote end */ static int receive_from_sock(struct connection *con) { int ret = 0; - struct msghdr msg; - struct iovec iov[2]; - mm_segment_t fs; + struct msghdr msg = {}; + struct kvec iov[2]; unsigned len; int r; int call_again_soon = 0; + int nvec; - down_read(&con->sock_sem); + mutex_lock(&con->sock_mutex); + + if (con->sock == NULL) { + ret = -EAGAIN; + goto out_close; + } - if (con->sock == NULL) - goto out; if (con->rx_page == NULL) { /* * This doesn't need to be atomic, but I think it should @@ -323,21 +293,13 @@ static int receive_from_sock(struct connection *con) cbuf_init(&con->cb, PAGE_CACHE_SIZE); } - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iovlen = 1; - msg.msg_iov = iov; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_flags = 0; - /* * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. */ iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; + nvec = 1; /* * iov[1] is the bit of the circular buffer between the start of the @@ -347,18 +309,18 @@ static int receive_from_sock(struct connection *con) iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb); iov[1].iov_len = con->cb.base; iov[1].iov_base = page_address(con->rx_page); - msg.msg_iovlen = 2; + nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; - fs = get_fs(); - set_fs(get_ds()); - r = ret = sock_recvmsg(con->sock, &msg, len, + r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, MSG_DONTWAIT | MSG_NOSIGNAL); - set_fs(fs); if (ret <= 0) goto out_close; + if (ret == -EAGAIN) + goto out_resched; + if (ret == len) call_again_soon = 1; cbuf_add(&con->cb, ret); @@ -381,24 +343,26 @@ static int receive_from_sock(struct connection *con) con->rx_page = NULL; } -out: if (call_again_soon) goto out_resched; - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return 0; out_resched: - lowcomms_data_ready(con->sock->sk, 0); - up_read(&con->sock_sem); - cond_resched(); - return 0; + if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) + queue_work(recv_workqueue, &con->rwork); + mutex_unlock(&con->sock_mutex); + return -EAGAIN; out_close: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) { close_connection(con, false); /* Reconnect when there is something to send */ } + /* Don't return success if we really got EOF */ + if (ret == 0) + ret = -EAGAIN; return ret; } @@ -412,6 +376,7 @@ static int accept_from_sock(struct connection *con) int len; int nodeid; struct connection *newcon; + struct connection *addcon; memset(&peeraddr, 0, sizeof(peeraddr)); result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, @@ -419,7 +384,7 @@ static int accept_from_sock(struct connection *con) if (result < 0) return -ENOMEM; - down_read(&con->sock_sem); + mutex_lock_nested(&con->sock_mutex, 0); result = -ENOTCONN; if (con->sock == NULL) @@ -445,7 +410,7 @@ static int accept_from_sock(struct connection *con) if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { printk("dlm: connect from non cluster node\n"); sock_release(newsock); - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return -1; } @@ -462,7 +427,7 @@ static int accept_from_sock(struct connection *con) result = -ENOMEM; goto accept_err; } - down_write(&newcon->sock_sem); + mutex_lock_nested(&newcon->sock_mutex, 1); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -470,41 +435,45 @@ static int accept_from_sock(struct connection *con) othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); if (!othercon) { printk("dlm: failed to allocate incoming socket\n"); - up_write(&newcon->sock_sem); + mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; - init_rwsem(&othercon->sock_sem); + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; } othercon->sock = newsock; newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); + addcon = othercon; } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; add_sock(newsock, newcon); - + addcon = newcon; } - up_write(&newcon->sock_sem); + mutex_unlock(&newcon->sock_mutex); /* * Add it to the active queue in case we got data * beween processing the accept adding the socket * to the read_sockets list */ - lowcomms_data_ready(newsock->sk, 0); - up_read(&con->sock_sem); + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); return 0; accept_err: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); sock_release(newsock); if (result != -EAGAIN) @@ -525,7 +494,7 @@ static void connect_to_sock(struct connection *con) return; } - down_write(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -548,7 +517,7 @@ static void connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; - make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); add_sock(sock, con); @@ -577,7 +546,7 @@ out_err: result = 0; } out: - up_write(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return; } @@ -616,10 +585,10 @@ static struct socket *create_listen_sock(struct connection *con, con->sock = sock; /* Bind to our port */ - make_sockaddr(saddr, dlm_config.tcp_port, &addr_len); + make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); if (result < 0) { - printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port); + printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port); sock_release(sock); sock = NULL; con->sock = NULL; @@ -638,7 +607,7 @@ static struct socket *create_listen_sock(struct connection *con, result = sock->ops->listen(sock, 5); if (result < 0) { - printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port); + printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port); sock_release(sock); sock = NULL; goto create_out; @@ -709,6 +678,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, if (!con) return NULL; + spin_lock(&con->writequeue_lock); e = list_entry(con->writequeue.prev, struct writequeue_entry, list); if ((&e->list == &con->writequeue) || (PAGE_CACHE_SIZE - e->end < len)) { @@ -747,6 +717,7 @@ void dlm_lowcomms_commit_buffer(void *mh) struct connection *con = e->con; int users; + spin_lock(&con->writequeue_lock); users = --e->users; if (users) goto out; @@ -754,12 +725,8 @@ void dlm_lowcomms_commit_buffer(void *mh) kunmap(e->page); spin_unlock(&con->writequeue_lock); - if (test_and_set_bit(CF_WRITE_PENDING, &con->flags) == 0) { - spin_lock_bh(&write_sockets_lock); - list_add_tail(&con->write_list, &write_sockets); - spin_unlock_bh(&write_sockets_lock); - - wake_up_interruptible(&lowcomms_send_waitq); + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { + queue_work(send_workqueue, &con->swork); } return; @@ -783,7 +750,7 @@ static void send_to_sock(struct connection *con) struct writequeue_entry *e; int len, offset; - down_read(&con->sock_sem); + mutex_lock(&con->sock_mutex); if (con->sock == NULL) goto out_connect; @@ -800,6 +767,7 @@ static void send_to_sock(struct connection *con) offset = e->offset; BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); + kmap(e->page); ret = 0; if (len) { @@ -828,18 +796,18 @@ static void send_to_sock(struct connection *con) } spin_unlock(&con->writequeue_lock); out: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); return; send_error: - up_read(&con->sock_sem); + mutex_unlock(&con->sock_mutex); close_connection(con, false); lowcomms_connect_sock(con); return; out_connect: - up_read(&con->sock_sem); - lowcomms_connect_sock(con); + mutex_unlock(&con->sock_mutex); + connect_to_sock(con); return; } @@ -872,7 +840,6 @@ int dlm_lowcomms_close(int nodeid) if (con) { clean_one_writequeue(con); close_connection(con, true); - atomic_set(&con->waiting_requests, 0); } return 0; @@ -880,102 +847,29 @@ out: return -1; } -/* API send message call, may queue the request */ -/* N.B. This is the old interface - use the new one for new calls */ -int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation) -{ - struct writequeue_entry *e; - char *b; - - e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b); - if (e) { - memcpy(b, buf, len); - dlm_lowcomms_commit_buffer(e); - return 0; - } - return -ENOBUFS; -} - /* Look for activity on active sockets */ -static void process_sockets(void) +static void process_recv_sockets(struct work_struct *work) { - struct list_head *list; - struct list_head *temp; - int count = 0; - - spin_lock_bh(&read_sockets_lock); - list_for_each_safe(list, temp, &read_sockets) { - - struct connection *con = - list_entry(list, struct connection, read_list); - list_del(&con->read_list); - clear_bit(CF_READ_PENDING, &con->flags); - - spin_unlock_bh(&read_sockets_lock); - - /* This can reach zero if we are processing requests - * as they come in. - */ - if (atomic_read(&con->waiting_requests) == 0) { - spin_lock_bh(&read_sockets_lock); - continue; - } - - do { - con->rx_action(con); - - /* Don't starve out everyone else */ - if (++count >= MAX_RX_MSG_COUNT) { - cond_resched(); - count = 0; - } + struct connection *con = container_of(work, struct connection, rwork); + int err; - } while (!atomic_dec_and_test(&con->waiting_requests) && - !kthread_should_stop()); - - spin_lock_bh(&read_sockets_lock); - } - spin_unlock_bh(&read_sockets_lock); + clear_bit(CF_READ_PENDING, &con->flags); + do { + err = con->rx_action(con); + } while (!err); } -/* Try to send any messages that are pending - */ -static void process_output_queue(void) -{ - struct list_head *list; - struct list_head *temp; - - spin_lock_bh(&write_sockets_lock); - list_for_each_safe(list, temp, &write_sockets) { - struct connection *con = - list_entry(list, struct connection, write_list); - clear_bit(CF_WRITE_PENDING, &con->flags); - list_del(&con->write_list); - spin_unlock_bh(&write_sockets_lock); - send_to_sock(con); - spin_lock_bh(&write_sockets_lock); - } - spin_unlock_bh(&write_sockets_lock); -} - -static void process_state_queue(void) +static void process_send_sockets(struct work_struct *work) { - struct list_head *list; - struct list_head *temp; - - spin_lock_bh(&state_sockets_lock); - list_for_each_safe(list, temp, &state_sockets) { - struct connection *con = - list_entry(list, struct connection, state_list); - list_del(&con->state_list); - clear_bit(CF_CONNECT_PENDING, &con->flags); - spin_unlock_bh(&state_sockets_lock); + struct connection *con = container_of(work, struct connection, swork); + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { connect_to_sock(con); - spin_lock_bh(&state_sockets_lock); } - spin_unlock_bh(&state_sockets_lock); + + clear_bit(CF_WRITE_PENDING, &con->flags); + send_to_sock(con); } @@ -992,109 +886,33 @@ static void clean_writequeues(void) } } -static int read_list_empty(void) -{ - int status; - - spin_lock_bh(&read_sockets_lock); - status = list_empty(&read_sockets); - spin_unlock_bh(&read_sockets_lock); - - return status; -} - -/* DLM Transport comms receive daemon */ -static int dlm_recvd(void *data) +static void work_stop(void) { - init_waitqueue_entry(&lowcomms_recv_waitq_head, current); - add_wait_queue(&lowcomms_recv_waitq, &lowcomms_recv_waitq_head); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (read_list_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - process_sockets(); - } - - return 0; + destroy_workqueue(recv_workqueue); + destroy_workqueue(send_workqueue); } -static int write_and_state_lists_empty(void) +static int work_start(void) { - int status; - - spin_lock_bh(&write_sockets_lock); - status = list_empty(&write_sockets); - spin_unlock_bh(&write_sockets_lock); - - spin_lock_bh(&state_sockets_lock); - if (list_empty(&state_sockets) == 0) - status = 0; - spin_unlock_bh(&state_sockets_lock); - - return status; -} - -/* DLM Transport send daemon */ -static int dlm_sendd(void *data) -{ - init_waitqueue_entry(&lowcomms_send_waitq_head, current); - add_wait_queue(&lowcomms_send_waitq, &lowcomms_send_waitq_head); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (write_and_state_lists_empty()) - cond_resched(); - set_current_state(TASK_RUNNING); - - process_state_queue(); - process_output_queue(); - } - - return 0; -} - -static void daemons_stop(void) -{ - kthread_stop(recv_task); - kthread_stop(send_task); -} - -static int daemons_start(void) -{ - struct task_struct *p; int error; - - p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); - error = IS_ERR(p); + recv_workqueue = create_workqueue("dlm_recv"); + error = IS_ERR(recv_workqueue); if (error) { - log_print("can't start dlm_recvd %d", error); + log_print("can't start dlm_recv %d", error); return error; } - recv_task = p; - p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); - error = IS_ERR(p); + send_workqueue = create_singlethread_workqueue("dlm_send"); + error = IS_ERR(send_workqueue); if (error) { - log_print("can't start dlm_sendd %d", error); - kthread_stop(recv_task); + log_print("can't start dlm_send %d", error); + destroy_workqueue(recv_workqueue); return error; } - send_task = p; return 0; } -/* - * Return the largest buffer size we can cope with. - */ -int lowcomms_max_buffer_size(void) -{ - return PAGE_CACHE_SIZE; -} - void dlm_lowcomms_stop(void) { int i; @@ -1107,7 +925,7 @@ void dlm_lowcomms_stop(void) connections[i]->flags |= 0xFF; } - daemons_stop(); + work_stop(); clean_writequeues(); for (i = 0; i < conn_array_size; i++) { @@ -1159,7 +977,7 @@ int dlm_lowcomms_start(void) if (error) goto fail_unlisten; - error = daemons_start(); + error = work_start(); if (error) goto fail_unlisten; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 5352b03ff5a..f858fef6e41 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -76,9 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; - lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL); - if (lkb) - memset(lkb, 0, sizeof(*lkb)); + lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); return lkb; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index c9b1c3d535f..a5126e0c68a 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -82,7 +82,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, if (msglen < sizeof(struct dlm_header)) break; err = -E2BIG; - if (msglen > dlm_config.buffer_size) { + if (msglen > dlm_config.ci_buffer_size) { log_print("message size %d from %d too big, buf len %d", msglen, nodeid, len); break; @@ -103,7 +103,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, if (msglen > sizeof(__tmp) && msg == (struct dlm_header *) __tmp) { - msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); + msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (msg == NULL) return ret; } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 4cc31be9cd9..6bfbd615380 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_type = type; + spin_lock(&ls->ls_recover_lock); + rc->rc_seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + *mh_ret = mh; *rc_ret = rc; return 0; @@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *ls, struct rcom_config *rf) rf->rf_lsflags = ls->ls_exflags; } -static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) +static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { + struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; + + if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + log_error(ls, "version mismatch: %x nodeid %d: %x", + DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, + rc->rc_header.h_version); + return -EINVAL; + } + if (rf->rf_lvblen != ls->ls_lvblen || rf->rf_lsflags != ls->ls_exflags) { log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", @@ -125,7 +138,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) goto out; allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); send_rcom(ls, mh, rc); @@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; } else - error = check_config(ls, (struct rcom_config *) rc->rc_buf, - nodeid); + error = check_config(ls, rc, nodeid); /* the caller looks at rc_result for the remote recovery status */ out: return error; @@ -159,6 +171,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) if (error) return; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; rc->rc_result = dlm_recover_status(ls); make_config(ls, (struct rcom_config *) rc->rc_buf); @@ -200,7 +213,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) if (nodeid == dlm_our_nodeid()) { dlm_copy_master_names(ls, last_name, last_len, ls->ls_recover_buf + len, - dlm_config.buffer_size - len, nodeid); + dlm_config.ci_buffer_size - len, nodeid); goto out; } @@ -210,7 +223,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); + memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); send_rcom(ls, mh, rc); @@ -224,30 +237,17 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, inlen, outlen; - int nodeid = rc_in->rc_header.h_nodeid; - uint32_t status = dlm_recover_status(ls); - - /* - * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while - * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes). - * It could only happen in rare cases where we get a late NAMES - * message from a previous instance of recovery. - */ - - if (!(status & DLM_RS_NODES)) { - log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid); - return; - } + int error, inlen, outlen, nodeid; nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom); + outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); if (error) return; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); @@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) ret_nodeid = error; rc->rc_result = ret_nodeid; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; send_rcom(ls, mh, rc); } @@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; send_rcom(ls, mh, rc); } static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) { - uint32_t status = dlm_recover_status(ls); - - if (!(status & DLM_RS_DIR)) { - log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u", - rc_in->rc_header.h_nodeid); - return; - } - dlm_recover_process_copy(ls, rc_in); } @@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc->rc_type = DLM_RCOM_STATUS_REPLY; rc->rc_id = rc_in->rc_id; + rc->rc_seq_reply = rc_in->rc_seq; rc->rc_result = -ESRCH; rf = (struct rcom_config *) rc->rc_buf; @@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) return 0; } +static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + uint64_t seq; + int rv = 0; + + switch (rc->rc_type) { + case DLM_RCOM_STATUS_REPLY: + case DLM_RCOM_NAMES_REPLY: + case DLM_RCOM_LOOKUP_REPLY: + case DLM_RCOM_LOCK_REPLY: + spin_lock(&ls->ls_recover_lock); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); + if (rc->rc_seq_reply != seq) { + log_debug(ls, "ignoring old reply %x from %d " + "seq_reply %llx expect %llx", + rc->rc_type, rc->rc_header.h_nodeid, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq); + rv = 1; + } + } + return rv; +} + /* Called by dlm_recvd; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ @@ -449,11 +469,14 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) } if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { - log_error(ls, "ignoring recovery message %x from %d", + log_debug(ls, "ignoring recovery message %x from %d", rc->rc_type, nodeid); goto out; } + if (is_old_reply(ls, rc)) + goto out; + if (nodeid != rc->rc_header.h_nodeid) { log_error(ls, "bad rcom nodeid %d from %d", rc->rc_header.h_nodeid, nodeid); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index cf9f6831bab..c2cc7694cd1 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -44,7 +44,7 @@ static void dlm_wait_timer_fn(unsigned long data) { struct dlm_ls *ls = (struct dlm_ls *) data; - mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ)); + mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ)); wake_up(&ls->ls_wait_general); } @@ -55,7 +55,7 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) init_timer(&ls->ls_timer); ls->ls_timer.function = dlm_wait_timer_fn; ls->ls_timer.data = (long) ls; - ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ); + ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ); add_timer(&ls->ls_timer); wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls)); @@ -397,7 +397,9 @@ int dlm_recover_masters(struct dlm_ls *ls) if (dlm_no_directory(ls)) count += recover_master_static(r); - else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) { + else if (!is_master(r) && + (dlm_is_removed(ls, r->res_nodeid) || + rsb_flag(r, RSB_NEW_MASTER))) { recover_master(r); count++; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 650536aa513..3cb636d6024 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -77,7 +77,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members(ls, rv, &neg); if (error) { - log_error(ls, "recover_members failed %d", error); + log_debug(ls, "recover_members failed %d", error); goto fail; } start = jiffies; @@ -89,7 +89,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_error(ls, "recover_directory failed %d", error); + log_debug(ls, "recover_directory failed %d", error); goto fail; } @@ -99,7 +99,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory_wait(ls); if (error) { - log_error(ls, "recover_directory_wait failed %d", error); + log_debug(ls, "recover_directory_wait failed %d", error); goto fail; } @@ -129,7 +129,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_error(ls, "recover_masters failed %d", error); + log_debug(ls, "recover_masters failed %d", error); goto fail; } @@ -139,13 +139,13 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_error(ls, "recover_locks failed %d", error); + log_debug(ls, "recover_locks failed %d", error); goto fail; } error = dlm_recover_locks_wait(ls); if (error) { - log_error(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "recover_locks_wait failed %d", error); goto fail; } @@ -166,7 +166,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_error(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "recover_locks_wait failed %d", error); goto fail; } } @@ -184,7 +184,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DONE); error = dlm_recover_done_wait(ls); if (error) { - log_error(ls, "recover_done_wait failed %d", error); + log_debug(ls, "recover_done_wait failed %d", error); goto fail; } @@ -192,19 +192,19 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_error(ls, "enable_locking failed %d", error); + log_debug(ls, "enable_locking failed %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_error(ls, "process_requestqueue failed %d", error); + log_debug(ls, "process_requestqueue failed %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_error(ls, "recover_waiters_post failed %d", error); + log_debug(ls, "recover_waiters_post failed %d", error); goto fail; } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c37e93e4f2d..40db61dc95f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -25,7 +25,7 @@ static const char *name_prefix="dlm"; static struct miscdevice ctl_device; -static struct file_operations device_fops; +static const struct file_operations device_fops; #ifdef CONFIG_COMPAT @@ -180,6 +180,14 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue)) remove_ownqueue = 1; + /* unlocks or cancels of waiting requests need to be removed from the + proc's unlocking list, again there must be a better way... */ + + if (ua->lksb.sb_status == -DLM_EUNLOCK || + (ua->lksb.sb_status == -DLM_ECANCEL && + lkb->lkb_grmode == DLM_LOCK_IV)) + remove_ownqueue = 1; + /* We want to copy the lvb to userspace when the completion ast is read if the status is 0, the lock has an lvb and lvb_ops says we should. We could probably have set_lvb_lock() @@ -523,6 +531,7 @@ static int device_open(struct inode *inode, struct file *file) proc->lockspace = ls->ls_local_handle; INIT_LIST_HEAD(&proc->asts); INIT_LIST_HEAD(&proc->locks); + INIT_LIST_HEAD(&proc->unlocking); spin_lock_init(&proc->asts_spin); spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); @@ -750,7 +759,7 @@ static int ctl_device_close(struct inode *inode, struct file *file) return 0; } -static struct file_operations device_fops = { +static const struct file_operations device_fops = { .open = device_open, .release = device_close, .read = device_read, @@ -759,7 +768,7 @@ static struct file_operations device_fops = { .owner = THIS_MODULE, }; -static struct file_operations ctl_device_fops = { +static const struct file_operations ctl_device_fops = { .open = ctl_device_open, .release = ctl_device_close, .write = device_write, diff --git a/fs/dlm/util.c b/fs/dlm/util.c index 767197db994..963889cf674 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc->rc_type = cpu_to_le32(rc->rc_type); rc->rc_result = cpu_to_le32(rc->rc_result); rc->rc_id = cpu_to_le64(rc->rc_id); + rc->rc_seq = cpu_to_le64(rc->rc_seq); + rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); if (type == DLM_RCOM_LOCK) rcom_lock_out((struct rcom_lock *) rc->rc_buf); @@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc->rc_type = le32_to_cpu(rc->rc_type); rc->rc_result = le32_to_cpu(rc->rc_result); rc->rc_id = le64_to_cpu(rc->rc_id); + rc->rc_seq = le64_to_cpu(rc->rc_seq); + rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); if (rc->rc_type == DLM_RCOM_LOCK) rcom_lock_in((struct rcom_lock *) rc->rc_buf); diff --git a/fs/dquot.c b/fs/dquot.c index 0952cc474d9..b16f991662c 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -79,6 +79,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> +#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ #include <asm/uaccess.h> @@ -600,11 +601,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; - dquot = kmem_cache_alloc(dquot_cachep, GFP_NOFS); + dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); if(!dquot) return NODQUOT; - memset((caddr_t)dquot, 0, sizeof(struct dquot)); mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); @@ -688,23 +688,27 @@ static int dqinit_needed(struct inode *inode, int type) /* This routine is guarded by dqonoff_mutex mutex */ static void add_dquot_ref(struct super_block *sb, int type) { - struct list_head *p; + struct inode *inode; restart: - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *filp = list_entry(p, struct file, f_u.fu_list); - struct inode *inode = filp->f_path.dentry->d_inode; - if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) { - struct dentry *dentry = dget(filp->f_path.dentry); - file_list_unlock(); - sb->dq_op->initialize(inode, type); - dput(dentry); - /* As we may have blocked we had better restart... */ - goto restart; - } + spin_lock(&inode_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!atomic_read(&inode->i_writecount)) + continue; + if (!dqinit_needed(inode, type)) + continue; + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + + __iget(inode); + spin_unlock(&inode_lock); + + sb->dq_op->initialize(inode, type); + iput(inode); + /* As we may have blocked we had better restart... */ + goto restart; } - file_list_unlock(); + spin_unlock(&inode_lock); } /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */ @@ -756,15 +760,30 @@ static void put_dquot_list(struct list_head *tofree_head) } } +static void remove_dquot_ref(struct super_block *sb, int type, + struct list_head *tofree_head) +{ + struct inode *inode; + + spin_lock(&inode_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!IS_NOQUOTA(inode)) + remove_inode_dquot_ref(inode, type, tofree_head); + } + spin_unlock(&inode_lock); +} + /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { LIST_HEAD(tofree_head); - down_write(&sb_dqopt(sb)->dqptr_sem); - remove_dquot_ref(sb, type, &tofree_head); - up_write(&sb_dqopt(sb)->dqptr_sem); - put_dquot_list(&tofree_head); + if (sb->dq_op) { + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); + put_dquot_list(&tofree_head); + } } static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) @@ -1822,7 +1841,7 @@ static int __init dquot_init(void) printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); - register_sysctl_table(sys_table, 0); + register_sysctl_table(sys_table); dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 4e4762389bd..03ea7696fe3 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_WILL_FREE)) continue; - invalidate_inode_pages(inode->i_mapping); + invalidate_mapping_pages(inode->i_mapping, 0, -1); } spin_unlock(&inode_lock); } diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index ca6562451ee..1f1107237ea 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o messaging.o netlink.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7196f50fe15..6ac630625b7 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -3,7 +3,7 @@ * * Copyright (C) 1997-2004 Erez Zadok * Copyright (C) 2001-2004 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * @@ -207,7 +207,7 @@ ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); mutex_init(&crypt_stat->cs_hash_tfm_mutex); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED); + crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; } /** @@ -305,8 +305,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, int rc = 0; BUG_ON(!crypt_stat || !crypt_stat->tfm - || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ECRYPTFS_STRUCT_INITIALIZED)); + || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", crypt_stat->key_size); @@ -429,10 +428,10 @@ static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx, goto out; } } else { - rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, - lower_inode, - lower_page_idx); - if (rc) { + *lower_page = grab_cache_page(lower_inode->i_mapping, + lower_page_idx); + if (!(*lower_page)) { + rc = -EINVAL; ecryptfs_printk( KERN_ERR, "Error attempting to grab and map " "lower page with index [0x%.16x]; rc = [%d]\n", @@ -485,7 +484,7 @@ int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx) lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host); inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host); crypt_stat = &inode_info->crypt_stat; - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode, ctx->param.lower_file); if (rc) @@ -617,7 +616,7 @@ int ecryptfs_decrypt_page(struct file *file, struct page *page) crypt_stat = &(ecryptfs_inode_to_private( page->mapping->host)->crypt_stat); lower_inode = ecryptfs_inode_to_lower(page->mapping->host); - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_do_readpage(file, page, page->index); if (rc) ecryptfs_printk(KERN_ERR, "Error attempting to copy " @@ -828,9 +827,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) mutex_unlock(&crypt_stat->cs_tfm_mutex); goto out; } - crypto_blkcipher_set_flags(crypt_stat->tfm, - (ECRYPTFS_DEFAULT_CHAINING_MODE - | CRYPTO_TFM_REQ_WEAK_KEY)); + crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); mutex_unlock(&crypt_stat->cs_tfm_mutex); rc = 0; out: @@ -865,7 +862,10 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; } else crypt_stat->header_extent_size = PAGE_CACHE_SIZE; - crypt_stat->num_header_extents_at_front = 1; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + crypt_stat->num_header_extents_at_front = 0; + else + crypt_stat->num_header_extents_at_front = 1; } /** @@ -881,7 +881,7 @@ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); BUG_ON(crypt_stat->iv_bytes <= 0); - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) { + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Session key not valid; " "cannot generate root IV\n"); @@ -898,8 +898,7 @@ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) out: if (rc) { memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); - ECRYPTFS_SET_FLAG(crypt_stat->flags, - ECRYPTFS_SECURITY_WARNING); + crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; } return rc; } @@ -907,7 +906,7 @@ out: static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) { get_random_bytes(crypt_stat->key, crypt_stat->key_size); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; ecryptfs_compute_root_iv(crypt_stat); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n"); @@ -917,6 +916,22 @@ static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) } /** + * ecryptfs_copy_mount_wide_flags_to_inode_flags + * + * This function propagates the mount-wide flags to individual inode + * flags. + */ +static void ecryptfs_copy_mount_wide_flags_to_inode_flags( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + if (mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; +} + +/** * ecryptfs_set_default_crypt_stat_vals * @crypt_stat * @@ -926,10 +941,12 @@ static void ecryptfs_set_default_crypt_stat_vals( struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); ecryptfs_set_default_sizes(crypt_stat); strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER); crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES; - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + crypt_stat->flags &= ~(ECRYPTFS_KEY_VALID); crypt_stat->file_version = ECRYPTFS_FILE_VERSION; crypt_stat->mount_crypt_stat = mount_crypt_stat; } @@ -969,8 +986,10 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) if (mount_crypt_stat->global_auth_tok) { ecryptfs_printk(KERN_DEBUG, "Initializing context for new " "file using mount_crypt_stat\n"); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++], mount_crypt_stat->global_auth_tok_sig, ECRYPTFS_SIG_SIZE_HEX); @@ -1003,7 +1022,7 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) * * Returns one if marker found; zero if not found */ -int contains_ecryptfs_marker(char *data) +static int contains_ecryptfs_marker(char *data) { u32 m_1, m_2; @@ -1029,7 +1048,8 @@ struct ecryptfs_flag_map_elem { /* Add support for additional flags by adding elements here. */ static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { {0x00000001, ECRYPTFS_ENABLE_HMAC}, - {0x00000002, ECRYPTFS_ENCRYPTED} + {0x00000002, ECRYPTFS_ENCRYPTED}, + {0x00000004, ECRYPTFS_METADATA_IN_XATTR} }; /** @@ -1052,11 +1072,9 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, for (i = 0; i < ((sizeof(ecryptfs_flag_map) / sizeof(struct ecryptfs_flag_map_elem))); i++) if (flags & ecryptfs_flag_map[i].file_flag) { - ECRYPTFS_SET_FLAG(crypt_stat->flags, - ecryptfs_flag_map[i].local_flag); + crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; } else - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, - ecryptfs_flag_map[i].local_flag); + crypt_stat->flags &= ~(ecryptfs_flag_map[i].local_flag); /* Version is in top 8 bits of the 32-bit flag vector */ crypt_stat->file_version = ((flags >> 24) & 0xFF); (*bytes_read) = 4; @@ -1093,8 +1111,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, for (i = 0; i < ((sizeof(ecryptfs_flag_map) / sizeof(struct ecryptfs_flag_map_elem))); i++) - if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ecryptfs_flag_map[i].local_flag)) + if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); @@ -1189,8 +1206,8 @@ int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code) * * Returns zero on success; non-zero otherwise */ -int ecryptfs_read_header_region(char *data, struct dentry *dentry, - struct vfsmount *mnt) +static int ecryptfs_read_header_region(char *data, struct dentry *dentry, + struct vfsmount *mnt) { struct file *lower_file; mm_segment_t oldfs; @@ -1219,9 +1236,25 @@ out: return rc; } -static void -write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, - size_t *written) +int ecryptfs_read_and_validate_header_region(char *data, struct dentry *dentry, + struct vfsmount *mnt) +{ + int rc; + + rc = ecryptfs_read_header_region(data, dentry, mnt); + if (rc) + goto out; + if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) + rc = -EINVAL; +out: + return rc; +} + + +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) { u32 header_extent_size; u16 num_header_extents_at_front; @@ -1270,9 +1303,9 @@ struct kmem_cache *ecryptfs_header_cache_2; * * Returns zero on success */ -int ecryptfs_write_headers_virt(char *page_virt, - struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry) +static int ecryptfs_write_headers_virt(char *page_virt, size_t *size, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) { int rc; size_t written; @@ -1283,7 +1316,8 @@ int ecryptfs_write_headers_virt(char *page_virt, offset += written; write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); offset += written; - write_header_metadata((page_virt + offset), crypt_stat, &written); + ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, + &written); offset += written; rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, ecryptfs_dentry, &written, @@ -1291,11 +1325,70 @@ int ecryptfs_write_headers_virt(char *page_virt, if (rc) ecryptfs_printk(KERN_WARNING, "Error generating key packet " "set; rc = [%d]\n", rc); + if (size) { + offset += written; + *size = offset; + } + return rc; +} + +static int ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, + struct file *lower_file, + char *page_virt) +{ + mm_segment_t oldfs; + int current_header_page; + int header_pages; + ssize_t size; + int rc = 0; + + lower_file->f_pos = 0; + oldfs = get_fs(); + set_fs(get_ds()); + size = vfs_write(lower_file, (char __user *)page_virt, PAGE_CACHE_SIZE, + &lower_file->f_pos); + if (size < 0) { + rc = (int)size; + printk(KERN_ERR "Error attempting to write lower page; " + "rc = [%d]\n", rc); + set_fs(oldfs); + goto out; + } + header_pages = ((crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front) + / PAGE_CACHE_SIZE); + memset(page_virt, 0, PAGE_CACHE_SIZE); + current_header_page = 1; + while (current_header_page < header_pages) { + size = vfs_write(lower_file, (char __user *)page_virt, + PAGE_CACHE_SIZE, &lower_file->f_pos); + if (size < 0) { + rc = (int)size; + printk(KERN_ERR "Error attempting to write lower page; " + "rc = [%d]\n", rc); + set_fs(oldfs); + goto out; + } + current_header_page++; + } + set_fs(oldfs); +out: + return rc; +} + +static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + char *page_virt, size_t size) +{ + int rc; + + rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, + size, 0); return rc; } /** - * ecryptfs_write_headers + * ecryptfs_write_metadata * @lower_file: The lower file struct, which was returned from dentry_open * * Write the file headers out. This will likely involve a userspace @@ -1306,22 +1399,18 @@ int ecryptfs_write_headers_virt(char *page_virt, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_headers(struct dentry *ecryptfs_dentry, - struct file *lower_file) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct file *lower_file) { - mm_segment_t oldfs; struct ecryptfs_crypt_stat *crypt_stat; char *page_virt; - int current_header_page; - int header_pages; + size_t size; int rc = 0; crypt_stat = &ecryptfs_inode_to_private( ecryptfs_dentry->d_inode)->crypt_stat; - if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ECRYPTFS_ENCRYPTED))) { - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ECRYPTFS_KEY_VALID)) { + if (likely(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { ecryptfs_printk(KERN_DEBUG, "Key is " "invalid; bailing out\n"); rc = -EINVAL; @@ -1334,54 +1423,42 @@ int ecryptfs_write_headers(struct dentry *ecryptfs_dentry, goto out; } /* Released in this function */ - page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, GFP_USER); + page_virt = kmem_cache_zalloc(ecryptfs_header_cache_0, GFP_USER); if (!page_virt) { ecryptfs_printk(KERN_ERR, "Out of memory\n"); rc = -ENOMEM; goto out; } - memset(page_virt, 0, PAGE_CACHE_SIZE); - rc = ecryptfs_write_headers_virt(page_virt, crypt_stat, - ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(page_virt, &size, crypt_stat, + ecryptfs_dentry); if (unlikely(rc)) { ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n"); memset(page_virt, 0, PAGE_CACHE_SIZE); goto out_free; } - ecryptfs_printk(KERN_DEBUG, - "Writing key packet set to underlying file\n"); - lower_file->f_pos = 0; - oldfs = get_fs(); - set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->" - "write() w/ header page; lower_file->f_pos = " - "[0x%.16x]\n", lower_file->f_pos); - lower_file->f_op->write(lower_file, (char __user *)page_virt, - PAGE_CACHE_SIZE, &lower_file->f_pos); - header_pages = ((crypt_stat->header_extent_size - * crypt_stat->num_header_extents_at_front) - / PAGE_CACHE_SIZE); - memset(page_virt, 0, PAGE_CACHE_SIZE); - current_header_page = 1; - while (current_header_page < header_pages) { - ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->" - "write() w/ zero'd page; lower_file->f_pos = " - "[0x%.16x]\n", lower_file->f_pos); - lower_file->f_op->write(lower_file, (char __user *)page_virt, - PAGE_CACHE_SIZE, &lower_file->f_pos); - current_header_page++; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, + crypt_stat, page_virt, + size); + else + rc = ecryptfs_write_metadata_to_contents(crypt_stat, lower_file, + page_virt); + if (rc) { + printk(KERN_ERR "Error writing metadata out to lower file; " + "rc = [%d]\n", rc); + goto out_free; } - set_fs(oldfs); - ecryptfs_printk(KERN_DEBUG, - "Done writing key packet set to underlying file.\n"); out_free: kmem_cache_free(ecryptfs_header_cache_0, page_virt); out: return rc; } +#define ECRYPTFS_DONT_VALIDATE_HEADER_SIZE 0 +#define ECRYPTFS_VALIDATE_HEADER_SIZE 1 static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, - char *virt, int *bytes_read) + char *virt, int *bytes_read, + int validate_header_size) { int rc = 0; u32 header_extent_size; @@ -1396,9 +1473,10 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->num_header_extents_at_front = (int)num_header_extents_at_front; (*bytes_read) = 6; - if ((crypt_stat->header_extent_size - * crypt_stat->num_header_extents_at_front) - < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) { + if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) + && ((crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front) + < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Invalid header extent size: " "[%d]\n", crypt_stat->header_extent_size); @@ -1429,7 +1507,8 @@ static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) */ static int ecryptfs_read_headers_virt(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry) + struct dentry *ecryptfs_dentry, + int validate_header_size) { int rc = 0; int offset; @@ -1463,7 +1542,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, offset += bytes_read; if (crypt_stat->file_version >= 1) { rc = parse_header_metadata(crypt_stat, (page_virt + offset), - &bytes_read); + &bytes_read, validate_header_size); if (rc) { ecryptfs_printk(KERN_WARNING, "Error reading header " "metadata; rc = [%d]\n", rc); @@ -1478,12 +1557,60 @@ out: } /** - * ecryptfs_read_headers + * ecryptfs_read_xattr_region + * + * Attempts to read the crypto metadata from the extended attribute + * region of the lower file. + */ +int ecryptfs_read_xattr_region(char *page_virt, struct dentry *ecryptfs_dentry) +{ + ssize_t size; + int rc = 0; + + size = ecryptfs_getxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, + page_virt, ECRYPTFS_DEFAULT_EXTENT_SIZE); + if (size < 0) { + printk(KERN_DEBUG "Error attempting to read the [%s] " + "xattr from the lower file; return value = [%zd]\n", + ECRYPTFS_XATTR_NAME, size); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +int ecryptfs_read_and_validate_xattr_region(char *page_virt, + struct dentry *ecryptfs_dentry) +{ + int rc; + + rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry); + if (rc) + goto out; + if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) { + printk(KERN_WARNING "Valid data found in [%s] xattr, but " + "the marker is invalid\n", ECRYPTFS_XATTR_NAME); + rc = -EINVAL; + } +out: + return rc; +} + +/** + * ecryptfs_read_metadata + * + * Common entry point for reading file metadata. From here, we could + * retrieve the header information from the header region of the file, + * the xattr region of the file, or some other repostory that is + * stored separately from the file itself. The current implementation + * supports retrieving the metadata information from the file contents + * and from the xattr region. * * Returns zero if valid headers found and parsed; non-zero otherwise */ -int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, - struct file *lower_file) +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry, + struct file *lower_file) { int rc = 0; char *page_virt = NULL; @@ -1491,7 +1618,12 @@ int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, ssize_t bytes_read; struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, + mount_crypt_stat); /* Read the first page from the underlying file */ page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER); if (!page_virt) { @@ -1512,11 +1644,36 @@ int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, goto out; } rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, - ecryptfs_dentry); + ecryptfs_dentry, + ECRYPTFS_VALIDATE_HEADER_SIZE); if (rc) { - ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not " - "found\n"); - rc = -EINVAL; + rc = ecryptfs_read_xattr_region(page_virt, + ecryptfs_dentry); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file header region or xattr region\n"); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry, + ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); + if (rc) { + printk(KERN_DEBUG "Valid eCryptfs headers not found in " + "file xattr region either\n"); + rc = -EINVAL; + } + if (crypt_stat->mount_crypt_stat->flags + & ECRYPTFS_XATTR_METADATA_ENABLED) { + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } else { + printk(KERN_WARNING "Attempt to access file with " + "crypto metadata only in the extended attribute " + "region, but eCryptfs was mounted without " + "xattr support enabled. eCryptfs will not treat " + "this like an encrypted file.\n"); + rc = -EINVAL; + } } out: if (page_virt) { diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c index 61f8e894284..434c7efd80f 100644 --- a/fs/ecryptfs/debug.c +++ b/fs/ecryptfs/debug.c @@ -36,7 +36,7 @@ void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n", auth_tok); - if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) { + if (auth_tok->flags & ECRYPTFS_PRIVATE_KEY) { ecryptfs_printk(KERN_DEBUG, " * private key type\n"); ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT " "IN ECRYPTFS VERSION 0.1)\n"); @@ -46,8 +46,8 @@ void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) ECRYPTFS_SALT_SIZE); salt[ECRYPTFS_SALT_SIZE * 2] = '\0'; ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt); - if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags, - ECRYPTFS_PERSISTENT_PASSWORD)) { + if (auth_tok->token.password.flags & + ECRYPTFS_PERSISTENT_PASSWORD) { ecryptfs_printk(KERN_DEBUG, " * persistent\n"); } memcpy(sig, auth_tok->token.password.signature, diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index afb64bdbe6a..403e3bad145 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -4,8 +4,10 @@ * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * Trevor S. Highland <trevor.highland@gmail.com> + * Tyler Hicks <tyhicks@ou.edu> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -31,22 +33,25 @@ #include <linux/fs_stack.h> #include <linux/namei.h> #include <linux/scatterlist.h> +#include <linux/hash.h> /* Version verification for shared data structures w/ userspace */ #define ECRYPTFS_VERSION_MAJOR 0x00 #define ECRYPTFS_VERSION_MINOR 0x04 -#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01 +#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x02 /* These flags indicate which features are supported by the kernel * module; userspace tools such as the mount helper read * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine * how to behave. */ -#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 -#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 +#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 +#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 #define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 -#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH) - + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_PUBKEY \ + | ECRYPTFS_VERSIONING_XATTR) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -60,10 +65,25 @@ #define ECRYPTFS_MAX_KEY_BYTES 64 #define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 #define ECRYPTFS_DEFAULT_IV_BYTES 16 -#define ECRYPTFS_FILE_VERSION 0x01 +#define ECRYPTFS_FILE_VERSION 0x02 #define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 +#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 +#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ +#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) +#define ECRYPTFS_NLMSG_HELO 100 +#define ECRYPTFS_NLMSG_QUIT 101 +#define ECRYPTFS_NLMSG_REQUEST 102 +#define ECRYPTFS_NLMSG_RESPONSE 103 +#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 +#define ECRYPTFS_DEFAULT_NUM_USERS 4 +#define ECRYPTFS_MAX_NUM_USERS 32768 +#define ECRYPTFS_TRANSPORT_NETLINK 0 +#define ECRYPTFS_TRANSPORT_CONNECTOR 1 +#define ECRYPTFS_TRANSPORT_RELAYFS 2 +#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_NETLINK +#define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define RFC2440_CIPHER_DES3_EDE 0x02 #define RFC2440_CIPHER_CAST_5 0x03 @@ -74,9 +94,7 @@ #define RFC2440_CIPHER_TWOFISH 0x0a #define RFC2440_CIPHER_CAST_6 0x0b -#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag)) -#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag)) -#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag)) +#define RFC2440_CIPHER_RSA 0x01 /** * For convenience, we may need to pass around the encrypted session @@ -114,6 +132,14 @@ struct ecryptfs_password { enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; +struct ecryptfs_private_key { + u32 key_size; + u32 data_len; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; + u8 data[]; +}; + /* May be a password or a private key */ struct ecryptfs_auth_tok { u16 version; /* 8-bit major and 8-bit minor */ @@ -123,7 +149,7 @@ struct ecryptfs_auth_tok { u8 reserved[32]; union { struct ecryptfs_password password; - /* Private key is in future eCryptfs releases */ + struct ecryptfs_private_key private_key; } token; } __attribute__ ((packed)); @@ -176,10 +202,14 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_FILE_SIZE_BYTES 8 #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC #define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED +#define ECRYPTFS_TAG_64_PACKET_TYPE 0x40 +#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 +#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 +#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 #define MD5_DIGEST_SIZE 16 /** @@ -196,6 +226,8 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_ENABLE_HMAC 0x00000020 #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 #define ECRYPTFS_KEY_VALID 0x00000080 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -242,6 +274,8 @@ struct ecryptfs_dentry_info { struct ecryptfs_mount_crypt_stat { /* Pointers to memory we do not own, do not free these */ #define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 +#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 +#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 u32 flags; struct ecryptfs_auth_tok *global_auth_tok; struct key *global_auth_tok_key; @@ -272,6 +306,33 @@ struct ecryptfs_auth_tok_list_item { struct ecryptfs_auth_tok auth_tok; }; +struct ecryptfs_message { + u32 index; + u32 data_len; + u8 data[]; +}; + +struct ecryptfs_msg_ctx { +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x0001 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x0002 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x0003 + u32 state; + unsigned int index; + unsigned int counter; + struct ecryptfs_message *msg; + struct task_struct *task; + struct list_head node; + struct mutex mux; +}; + +extern unsigned int ecryptfs_transport; + +struct ecryptfs_daemon_id { + pid_t pid; + uid_t uid; + struct hlist_node id_chain; +}; + static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { @@ -385,13 +446,16 @@ void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; extern const struct file_operations ecryptfs_dir_fops; -extern struct inode_operations ecryptfs_main_iops; -extern struct inode_operations ecryptfs_dir_iops; -extern struct inode_operations ecryptfs_symlink_iops; -extern struct super_operations ecryptfs_sops; +extern const struct inode_operations ecryptfs_main_iops; +extern const struct inode_operations ecryptfs_dir_iops; +extern const struct inode_operations ecryptfs_symlink_iops; +extern const struct super_operations ecryptfs_sops; extern struct dentry_operations ecryptfs_dops; extern struct address_space_operations ecryptfs_aops; extern int ecryptfs_verbosity; +extern unsigned int ecryptfs_message_buf_len; +extern signed long ecryptfs_message_wait_timeout; +extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; @@ -401,7 +465,9 @@ extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache_0; extern struct kmem_cache *ecryptfs_header_cache_1; extern struct kmem_cache *ecryptfs_header_cache_2; +extern struct kmem_cache *ecryptfs_xattr_cache; extern struct kmem_cache *ecryptfs_lower_page_cache; +extern struct kmem_cache *ecryptfs_key_record_cache; int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, @@ -427,9 +493,13 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, char *cipher_name, char *chaining_modifier); -int ecryptfs_write_inode_size_to_header(struct file *lower_file, - struct inode *lower_inode, - struct inode *inode); +#define ECRYPTFS_LOWER_I_MUTEX_NOT_HELD 0 +#define ECRYPTFS_LOWER_I_MUTEX_HELD 1 +int ecryptfs_write_inode_size_to_metadata(struct file *lower_file, + struct inode *lower_inode, + struct inode *inode, + struct dentry *ecryptfs_dentry, + int lower_i_mutex_held); int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, struct file *lower_file, unsigned long lower_page_index, int byte_offset, @@ -442,26 +512,20 @@ int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode, struct file *lower_file); int ecryptfs_do_readpage(struct file *file, struct page *page, pgoff_t lower_page_index); -int ecryptfs_grab_and_map_lower_page(struct page **lower_page, - char **lower_virt, - struct inode *lower_inode, - unsigned long lower_page_index); int ecryptfs_writepage_and_release_lower_page(struct page *lower_page, struct inode *lower_inode, struct writeback_control *wbc); int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx); int ecryptfs_decrypt_page(struct file *file, struct page *page); -int ecryptfs_write_headers(struct dentry *ecryptfs_dentry, +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct file *lower_file); +int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry, struct file *lower_file); -int ecryptfs_write_headers_virt(char *page_virt, - struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry); -int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, - struct file *lower_file); int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); -int contains_ecryptfs_marker(char *data); -int ecryptfs_read_header_region(char *data, struct dentry *dentry, - struct vfsmount *mnt); +int ecryptfs_read_and_validate_header_region(char *data, struct dentry *dentry, + struct vfsmount *mnt); +int ecryptfs_read_and_validate_xattr_region(char *page_virt, + struct dentry *ecryptfs_dentry); u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); @@ -484,5 +548,37 @@ int ecryptfs_open_lower_file(struct file **lower_file, struct dentry *lower_dentry, struct vfsmount *lower_mnt, int flags); int ecryptfs_close_lower_file(struct file *lower_file); +ssize_t ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size); +int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int ecryptfs_read_xattr_region(char *page_virt, struct dentry *ecryptfs_dentry); +int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid); +int ecryptfs_process_quit(uid_t uid, pid_t pid); +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, + pid_t pid, u32 seq); +int ecryptfs_send_message(unsigned int transport, char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx); +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg); +int ecryptfs_init_messaging(unsigned int transport); +void ecryptfs_release_messaging(unsigned int transport); + +int ecryptfs_send_netlink(char *data, int data_len, + struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, + u16 msg_flags, pid_t daemon_pid); +int ecryptfs_init_netlink(void); +void ecryptfs_release_netlink(void); + +int ecryptfs_send_connector(char *data, int data_len, + struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, + u16 msg_flags, pid_t daemon_pid); +int ecryptfs_init_connector(void); +void ecryptfs_release_connector(void); +void +ecryptfs_write_header_metadata(char *virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c5a2e5298f1..bd969adf70d 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -3,7 +3,7 @@ * * Copyright (C) 1997-2004 Erez Zadok * Copyright (C) 2001-2004 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * @@ -250,8 +250,19 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct ecryptfs_file_info *file_info; int lower_flags; + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) + || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) + || (file->f_flags & O_APPEND))) { + printk(KERN_WARNING "Mount has encrypted view enabled; " + "files may only be read\n"); + rc = -EPERM; + goto out; + } /* Released in ecryptfs_release or end of function if failure */ - file_info = kmem_cache_alloc(ecryptfs_file_info_cache, GFP_KERNEL); + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); ecryptfs_set_file_private(file, file_info); if (!file_info) { ecryptfs_printk(KERN_ERR, @@ -259,17 +270,14 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -ENOMEM; goto out; } - memset(file_info, 0, sizeof(*file_info)); lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; mutex_lock(&crypt_stat->cs_mutex); - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) { + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n"); /* Policy code enabled in future release */ - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + crypt_stat->flags |= ECRYPTFS_POLICY_APPLIED; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; } mutex_unlock(&crypt_stat->cs_mutex); lower_flags = file->f_flags; @@ -289,31 +297,14 @@ static int ecryptfs_open(struct inode *inode, struct file *file) lower_inode = lower_dentry->d_inode; if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); rc = 0; goto out; } mutex_lock(&crypt_stat->cs_mutex); - if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) { - if (!(mount_crypt_stat->flags - & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { - rc = -EIO; - printk(KERN_WARNING "Attempt to read file that is " - "not in a valid eCryptfs format, and plaintext " - "passthrough mode is not enabled; returning " - "-EIO\n"); - mutex_unlock(&crypt_stat->cs_mutex); - goto out_puts; - } - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - rc = 0; - mutex_unlock(&crypt_stat->cs_mutex); - goto out; - } else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ECRYPTFS_POLICY_APPLIED) - || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, - ECRYPTFS_KEY_VALID)) { - rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file); + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) + || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { + rc = ecryptfs_read_metadata(ecryptfs_dentry, lower_file); if (rc) { ecryptfs_printk(KERN_DEBUG, "Valid headers not found\n"); @@ -327,9 +318,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file) mutex_unlock(&crypt_stat->cs_mutex); goto out_puts; } - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, - ECRYPTFS_ENCRYPTED); rc = 0; + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); mutex_unlock(&crypt_stat->cs_mutex); goto out; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 11f5e5076ae..9fa7e0b27a9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -3,7 +3,7 @@ * * Copyright (C) 1997-2004 Erez Zadok * Copyright (C) 2001-2004 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompsion <mcthomps@us.ibm.com> * @@ -161,17 +161,17 @@ static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file, ecryptfs_set_file_lower(&fake_file, lower_file); rc = ecryptfs_fill_zeros(&fake_file, 1); if (rc) { - ECRYPTFS_SET_FLAG( - ecryptfs_inode_to_private(inode)->crypt_stat.flags, - ECRYPTFS_SECURITY_WARNING); + ecryptfs_inode_to_private(inode)->crypt_stat.flags |= + ECRYPTFS_SECURITY_WARNING; ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros " "in file; rc = [%d]\n", rc); goto out; } i_size_write(inode, 0); - ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode); - ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags, - ECRYPTFS_NEW_FILE); + ecryptfs_write_inode_size_to_metadata(lower_file, lower_inode, inode, + ecryptfs_dentry, + ECRYPTFS_LOWER_I_MUTEX_NOT_HELD); + ecryptfs_inode_to_private(inode)->crypt_stat.flags |= ECRYPTFS_NEW_FILE; out: return rc; } @@ -199,7 +199,7 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) lower_dentry->d_name.name); inode = ecryptfs_dentry->d_inode; crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; - lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & O_ACCMODE) | O_RDWR; + lower_flags = ((O_CREAT | O_TRUNC) & O_ACCMODE) | O_RDWR; #if BITS_PER_LONG != 32 lower_flags |= O_LARGEFILE; #endif @@ -214,10 +214,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) lower_inode = lower_dentry->d_inode; if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out_fput; } - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE); + crypt_stat->flags |= ECRYPTFS_NEW_FILE; ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); rc = ecryptfs_new_file_context(ecryptfs_dentry); if (rc) { @@ -225,7 +225,7 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context\n"); goto out_fput; } - rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file); + rc = ecryptfs_write_metadata(ecryptfs_dentry, lower_file); if (rc) { ecryptfs_printk(KERN_DEBUG, "Error writing headers\n"); goto out_fput; @@ -287,6 +287,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, char *encoded_name; unsigned int encoded_namelen; struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *page_virt = NULL; struct inode *lower_inode; u64 file_size; @@ -361,34 +362,44 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } /* Released in this function */ - page_virt = - (char *)kmem_cache_alloc(ecryptfs_header_cache_2, - GFP_USER); + page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, + GFP_USER); if (!page_virt) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Cannot ecryptfs_kmalloc a page\n"); goto out_dput; } - memset(page_virt, 0, PAGE_CACHE_SIZE); - rc = ecryptfs_read_header_region(page_virt, lower_dentry, nd->mnt); crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) ecryptfs_set_default_sizes(crypt_stat); + rc = ecryptfs_read_and_validate_header_region(page_virt, lower_dentry, + nd->mnt); if (rc) { - rc = 0; - ecryptfs_printk(KERN_WARNING, "Error reading header region;" - " assuming unencrypted\n"); - } else { - if (!contains_ecryptfs_marker(page_virt - + ECRYPTFS_FILE_SIZE_BYTES)) { + rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); + if (rc) { + printk(KERN_DEBUG "Valid metadata not found in header " + "region or xattr region; treating file as " + "unencrypted\n"); + rc = 0; kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; } + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + file_size = (crypt_stat->header_extent_size + + i_size_read(lower_dentry->d_inode)); + else + file_size = i_size_read(lower_dentry->d_inode); + } else { memcpy(&file_size, page_virt, sizeof(file_size)); file_size = be64_to_cpu(file_size); - i_size_write(dentry->d_inode, (loff_t)file_size); } + i_size_write(dentry->d_inode, (loff_t)file_size); kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; @@ -782,20 +793,26 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_fput; } i_size_write(inode, new_length); - rc = ecryptfs_write_inode_size_to_header(lower_file, - lower_dentry->d_inode, - inode); + rc = ecryptfs_write_inode_size_to_metadata( + lower_file, lower_dentry->d_inode, inode, dentry, + ECRYPTFS_LOWER_I_MUTEX_NOT_HELD); if (rc) { - ecryptfs_printk(KERN_ERR, - "Problem with ecryptfs_write" - "_inode_size\n"); + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); goto out_fput; } } else { /* new_length < i_size_read(inode) */ vmtruncate(inode, new_length); - ecryptfs_write_inode_size_to_header(lower_file, - lower_dentry->d_inode, - inode); + rc = ecryptfs_write_inode_size_to_metadata( + lower_file, lower_dentry->d_inode, inode, dentry, + ECRYPTFS_LOWER_I_MUTEX_NOT_HELD); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out_fput; + } /* We are reducing the size of the ecryptfs file, and need to * know if we need to reduce the size of the lower file. */ lower_size_before_truncate = @@ -882,7 +899,7 @@ out: return rc; } -static int +int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { @@ -902,7 +919,7 @@ out: return rc; } -static ssize_t +ssize_t ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { @@ -972,7 +989,7 @@ int ecryptfs_inode_set(struct inode *inode, void *lower_inode) return 0; } -struct inode_operations ecryptfs_symlink_iops = { +const struct inode_operations ecryptfs_symlink_iops = { .readlink = ecryptfs_readlink, .follow_link = ecryptfs_follow_link, .put_link = ecryptfs_put_link, @@ -984,7 +1001,7 @@ struct inode_operations ecryptfs_symlink_iops = { .removexattr = ecryptfs_removexattr }; -struct inode_operations ecryptfs_dir_iops = { +const struct inode_operations ecryptfs_dir_iops = { .create = ecryptfs_create, .lookup = ecryptfs_lookup, .link = ecryptfs_link, @@ -1002,7 +1019,7 @@ struct inode_operations ecryptfs_dir_iops = { .removexattr = ecryptfs_removexattr }; -struct inode_operations ecryptfs_main_iops = { +const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .setxattr = ecryptfs_setxattr, diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 745c0f1bfbb..b550dea8eee 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -7,6 +7,7 @@ * Copyright (C) 2004-2006 International Business Machines Corp. * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> + * Trevor S. Highland <trevor.highland@gmail.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -25,7 +26,6 @@ */ #include <linux/string.h> -#include <linux/sched.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/key.h> @@ -64,26 +64,6 @@ int process_request_key_err(long err_code) return rc; } -static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) -{ - struct list_head *walker; - struct ecryptfs_auth_tok_list_item *auth_tok_list_item; - - walker = auth_tok_list_head->next; - while (walker != auth_tok_list_head) { - auth_tok_list_item = - list_entry(walker, struct ecryptfs_auth_tok_list_item, - list); - walker = auth_tok_list_item->list.next; - memset(auth_tok_list_item, 0, - sizeof(struct ecryptfs_auth_tok_list_item)); - kmem_cache_free(ecryptfs_auth_tok_list_item_cache, - auth_tok_list_item); - } -} - -struct kmem_cache *ecryptfs_auth_tok_list_item_cache; - /** * parse_packet_length * @data: Pointer to memory containing length at offset @@ -102,12 +82,12 @@ static int parse_packet_length(unsigned char *data, size_t *size, (*size) = 0; if (data[0] < 192) { /* One-byte length */ - (*size) = data[0]; + (*size) = (unsigned char)data[0]; (*length_size) = 1; } else if (data[0] < 224) { /* Two-byte length */ - (*size) = ((data[0] - 192) * 256); - (*size) += (data[1] + 192); + (*size) = (((unsigned char)(data[0]) - 192) * 256); + (*size) += ((unsigned char)(data[1]) + 192); (*length_size) = 2; } else if (data[0] == 255) { /* Five-byte length; we're not supposed to see this */ @@ -154,6 +134,499 @@ static int write_packet_length(char *dest, size_t size, return rc; } +static int +write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, + char **packet, size_t *packet_len) +{ + size_t i = 0; + size_t data_len; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 64 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + + session_key->encrypted_key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; + rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + rc = write_packet_length(&message[i], session_key->encrypted_key_size, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], session_key->encrypted_key, + session_key->encrypted_key_size); + i += session_key->encrypted_key_size; + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_65_packet(struct ecryptfs_session_key *session_key, u16 *cipher_code, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t m_size; + size_t message_len; + u16 checksum = 0; + u16 expected_checksum = 0; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + if (message_len < 4) { + rc = -EIO; + goto out; + } + if (data[i++] != ECRYPTFS_TAG_65_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_65\n"); + rc = -EIO; + goto out; + } + if (data[i++]) { + ecryptfs_printk(KERN_ERR, "Status indicator has non-zero value " + "[%d]\n", data[i-1]); + rc = -EIO; + goto out; + } + rc = parse_packet_length(&data[i], &m_size, &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + m_size)) { + ecryptfs_printk(KERN_ERR, "The received netlink message is " + "shorter than expected\n"); + rc = -EIO; + goto out; + } + if (m_size < 3) { + ecryptfs_printk(KERN_ERR, + "The decrypted key is not long enough to " + "include a cipher code and checksum\n"); + rc = -EIO; + goto out; + } + *cipher_code = data[i++]; + /* The decrypted key includes 1 byte cipher code and 2 byte checksum */ + session_key->decrypted_key_size = m_size - 3; + if (session_key->decrypted_key_size > ECRYPTFS_MAX_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "key_size [%d] larger than " + "the maximum key size [%d]\n", + session_key->decrypted_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + rc = -EIO; + goto out; + } + memcpy(session_key->decrypted_key, &data[i], + session_key->decrypted_key_size); + i += session_key->decrypted_key_size; + expected_checksum += (unsigned char)(data[i++]) << 8; + expected_checksum += (unsigned char)(data[i++]); + for (i = 0; i < session_key->decrypted_key_size; i++) + checksum += session_key->decrypted_key[i]; + if (expected_checksum != checksum) { + ecryptfs_printk(KERN_ERR, "Invalid checksum for file " + "encryption key; expected [%x]; calculated " + "[%x]\n", expected_checksum, checksum); + rc = -EIO; + } +out: + return rc; +} + + +static int +write_tag_66_packet(char *signature, size_t cipher_code, + struct ecryptfs_crypt_stat *crypt_stat, char **packet, + size_t *packet_len) +{ + size_t i = 0; + size_t j; + size_t data_len; + size_t checksum = 0; + size_t packet_size_len; + char *message; + int rc; + + /* + * ***** TAG 66 Packet Format ***** + * | Content Type | 1 byte | + * | Key Identifier Size | 1 or 2 bytes | + * | Key Identifier | arbitrary | + * | File Encryption Key Size | 1 or 2 bytes | + * | File Encryption Key | arbitrary | + */ + data_len = (5 + ECRYPTFS_SIG_SIZE_HEX + crypt_stat->key_size); + *packet = kmalloc(data_len, GFP_KERNEL); + message = *packet; + if (!message) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; + rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); + i += ECRYPTFS_SIG_SIZE_HEX; + /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ + rc = write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " + "header; cannot generate packet length\n"); + goto out; + } + i += packet_size_len; + message[i++] = cipher_code; + memcpy(&message[i], crypt_stat->key, crypt_stat->key_size); + i += crypt_stat->key_size; + for (j = 0; j < crypt_stat->key_size; j++) + checksum += crypt_stat->key[j]; + message[i++] = (checksum / 256) % 256; + message[i++] = (checksum % 256); + *packet_len = i; +out: + return rc; +} + +static int +parse_tag_67_packet(struct ecryptfs_key_record *key_rec, + struct ecryptfs_message *msg) +{ + size_t i = 0; + char *data; + size_t data_len; + size_t message_len; + int rc; + + /* + * ***** TAG 65 Packet Format ***** + * | Content Type | 1 byte | + * | Status Indicator | 1 byte | + * | Encrypted File Encryption Key Size | 1 or 2 bytes | + * | Encrypted File Encryption Key | arbitrary | + */ + message_len = msg->data_len; + data = msg->data; + /* verify that everything through the encrypted FEK size is present */ + if (message_len < 4) { + rc = -EIO; + goto out; + } + if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_67\n"); + rc = -EIO; + goto out; + } + if (data[i++]) { + ecryptfs_printk(KERN_ERR, "Status indicator has non zero value" + " [%d]\n", data[i-1]); + rc = -EIO; + goto out; + } + rc = parse_packet_length(&data[i], &key_rec->enc_key_size, &data_len); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out; + } + i += data_len; + if (message_len < (i + key_rec->enc_key_size)) { + ecryptfs_printk(KERN_ERR, "message_len [%d]; max len is [%d]\n", + message_len, (i + key_rec->enc_key_size)); + rc = -EIO; + goto out; + } + if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "Encrypted key_size [%d] larger than " + "the maximum key size [%d]\n", + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); + rc = -EIO; + goto out; + } + memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); +out: + return rc; +} + +/** + * decrypt_pki_encrypted_session_key - Decrypt the session key with + * the given auth_tok. + * + * Returns Zero on success; non-zero error otherwise. + */ +static int decrypt_pki_encrypted_session_key( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + u16 cipher_code = 0; + struct ecryptfs_msg_ctx *msg_ctx; + struct ecryptfs_message *msg = NULL; + char *netlink_message; + size_t netlink_message_length; + int rc; + + rc = write_tag_64_packet(mount_crypt_stat->global_auth_tok_sig, + &(auth_tok->session_key), + &netlink_message, &netlink_message_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet"); + goto out; + } + rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, + netlink_message_length, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 65 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_65_packet(&(auth_tok->session_key), + &cipher_code, msg); + if (rc) { + printk(KERN_ERR "Failed to parse tag 65 packet; rc = [%d]\n", + rc); + goto out; + } + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + crypt_stat->key_size = auth_tok->session_key.decrypted_key_size; + rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); + if (rc) { + ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", + cipher_code) + goto out; + } + crypt_stat->flags |= ECRYPTFS_KEY_VALID; + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +out: + if (msg) + kfree(msg); + return rc; +} + +static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) +{ + struct list_head *walker; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + + walker = auth_tok_list_head->next; + while (walker != auth_tok_list_head) { + auth_tok_list_item = + list_entry(walker, struct ecryptfs_auth_tok_list_item, + list); + walker = auth_tok_list_item->list.next; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + } + auth_tok_list_head->next = NULL; +} + +struct kmem_cache *ecryptfs_auth_tok_list_item_cache; + + +/** + * parse_tag_1_packet + * @crypt_stat: The cryptographic context to modify based on packet + * contents. + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the end + * of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + int rc = 0; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + + /* we check that: + * one byte for the Tag 1 ID flag + * two bytes for the body size + * do not exceed the maximum_packet_size + */ + if (unlikely((*packet_size) + 3 > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + /* check for Tag 1 identifier - one byte */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_1_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n", + ECRYPTFS_TAG_1_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, + GFP_KERNEL); + if (!auth_tok_list_item) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + /* check for body size - one to two bytes + * + * ***** TAG 1 Packet Format ***** + * | version number | 1 byte | + * | key ID | 8 bytes | + * | public key algorithm | 1 byte | + * | encrypted session key | arbitrary | + */ + rc = parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out_free; + } + if (unlikely(body_size < (0x02 + ECRYPTFS_SIG_SIZE))) { + ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n", + body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + if (unlikely((*packet_size) + body_size > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + /* Version 3 (from RFC2440) - one byte */ + if (unlikely(data[(*packet_size)++] != 0x03)) { + ecryptfs_printk(KERN_DEBUG, "Unknown version number " + "[%d]\n", data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + /* Read Signature */ + ecryptfs_to_hex((*new_auth_tok)->token.private_key.signature, + &data[(*packet_size)], ECRYPTFS_SIG_SIZE); + *packet_size += ECRYPTFS_SIG_SIZE; + /* This byte is skipped because the kernel does not need to + * know which public key encryption algorithm was used */ + (*packet_size)++; + (*new_auth_tok)->session_key.encrypted_key_size = + body_size - (0x02 + ECRYPTFS_SIG_SIZE); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + ecryptfs_printk(KERN_ERR, "Tag 1 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n", + (*new_auth_tok)->session_key.encrypted_key_size); + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], (body_size - 0x02 - ECRYPTFS_SIG_SIZE)); + (*packet_size) += (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token_type = ECRYPTFS_PRIVATE_KEY; + (*new_auth_tok)->flags |= ECRYPTFS_PRIVATE_KEY; + /* TODO: Why are we setting this flag here? Don't we want the + * userspace to decrypt the session key? */ + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + /** * parse_tag_3_packet * @crypt_stat: The cryptographic context to modify based on packet @@ -178,10 +651,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_auth_tok **new_auth_tok, size_t *packet_size, size_t max_packet_size) { - int rc = 0; size_t body_size; struct ecryptfs_auth_tok_list_item *auth_tok_list_item; size_t length_size; + int rc = 0; (*packet_size) = 0; (*new_auth_tok) = NULL; @@ -207,14 +680,12 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or * at end of function upon failure */ auth_tok_list_item = - kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); + kmem_cache_zalloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); if (!auth_tok_list_item) { ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); rc = -ENOMEM; goto out; } - memset(auth_tok_list_item, 0, - sizeof(struct ecryptfs_auth_tok_list_item)); (*new_auth_tok) = &auth_tok_list_item->auth_tok; /* check for body size - one to two bytes */ @@ -321,10 +792,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD; /* TODO: Parametarize; we might actually want userspace to * decrypt the session key. */ - ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags, - ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); - ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags, - ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + (*new_auth_tok)->session_key.flags &= + ~(ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); list_add(&auth_tok_list_item->list, auth_tok_list); goto out; out_free: @@ -360,9 +831,9 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, size_t max_contents_bytes, size_t *tag_11_contents_size, size_t *packet_size, size_t max_packet_size) { - int rc = 0; size_t body_size; size_t length_size; + int rc = 0; (*packet_size) = 0; (*tag_11_contents_size) = 0; @@ -461,7 +932,6 @@ static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_password *password_s_ptr; struct scatterlist src_sg[2], dst_sg[2]; struct mutex *tfm_mutex = NULL; - /* TODO: Use virt_to_scatterlist for these */ char *encrypted_session_key; char *session_key; struct blkcipher_desc desc = { @@ -470,8 +940,7 @@ static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok, int rc = 0; password_s_ptr = &auth_tok->token.password; - if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags, - ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) + if (password_s_ptr->flags & ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) ecryptfs_printk(KERN_DEBUG, "Session key encryption key " "set; skipping key generation\n"); ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])" @@ -553,7 +1022,7 @@ static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok, auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, auth_tok->session_key.decrypted_key_size); - ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + crypt_stat->flags |= ECRYPTFS_KEY_VALID; ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); if (ecryptfs_verbosity > 0) ecryptfs_dump_hex(crypt_stat->key, @@ -589,7 +1058,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, struct dentry *ecryptfs_dentry) { size_t i = 0; - int rc = 0; size_t found_auth_tok = 0; size_t next_packet_is_auth_tok_packet; char sig[ECRYPTFS_SIG_SIZE_HEX]; @@ -605,6 +1073,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE]; size_t tag_11_contents_size; size_t tag_11_packet_size; + int rc = 0; INIT_LIST_HEAD(&auth_tok_list); /* Parse the header to find as many packets as we can, these will be @@ -656,8 +1125,21 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, sig_tmp_space, tag_11_contents_size); new_auth_tok->token.password.signature[ ECRYPTFS_PASSWORD_SIG_SIZE] = '\0'; - ECRYPTFS_SET_FLAG(crypt_stat->flags, - ECRYPTFS_ENCRYPTED); + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; + break; + case ECRYPTFS_TAG_1_PACKET_TYPE: + rc = parse_tag_1_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 1 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + crypt_stat->flags |= ECRYPTFS_ENCRYPTED; break; case ECRYPTFS_TAG_11_PACKET_TYPE: ecryptfs_printk(KERN_WARNING, "Invalid packet set " @@ -706,31 +1188,46 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, goto leave_list; /* TODO: Transfer the common salt into the * crypt_stat salt */ + } else if ((candidate_auth_tok->token_type + == ECRYPTFS_PRIVATE_KEY) + && !strncmp(candidate_auth_tok->token.private_key.signature, + sig, ECRYPTFS_SIG_SIZE_HEX)) { + found_auth_tok = 1; + goto leave_list; } } -leave_list: if (!found_auth_tok) { ecryptfs_printk(KERN_ERR, "Could not find authentication " "token on temporary list for sig [%.*s]\n", ECRYPTFS_SIG_SIZE_HEX, sig); rc = -EIO; goto out_wipe_list; - } else { + } +leave_list: + rc = -ENOTSUPP; + if (candidate_auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + memcpy(&(candidate_auth_tok->token.private_key), + &(chosen_auth_tok->token.private_key), + sizeof(struct ecryptfs_private_key)); + rc = decrypt_pki_encrypted_session_key(mount_crypt_stat, + candidate_auth_tok, + crypt_stat); + } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { memcpy(&(candidate_auth_tok->token.password), &(chosen_auth_tok->token.password), sizeof(struct ecryptfs_password)); rc = decrypt_session_key(candidate_auth_tok, crypt_stat); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error decrypting the " - "session key\n"); - goto out_wipe_list; - } - rc = ecryptfs_compute_root_iv(crypt_stat); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error computing " - "the root IV\n"); - goto out_wipe_list; - } + } + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting the " + "session key; rc = [%d]\n", rc); + goto out_wipe_list; + } + rc = ecryptfs_compute_root_iv(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error computing " + "the root IV\n"); + goto out_wipe_list; } rc = ecryptfs_init_crypt_ctx(crypt_stat); if (rc) { @@ -743,6 +1240,145 @@ out_wipe_list: out: return rc; } +static int +pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec) +{ + struct ecryptfs_msg_ctx *msg_ctx = NULL; + char *netlink_payload; + size_t netlink_payload_length; + struct ecryptfs_message *msg; + int rc; + + rc = write_tag_66_packet(auth_tok->token.private_key.signature, + ecryptfs_code_for_cipher_string(crypt_stat), + crypt_stat, &netlink_payload, + &netlink_payload_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); + goto out; + } + rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload, + netlink_payload_length, &msg_ctx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + goto out; + } + rc = ecryptfs_wait_for_response(msg_ctx, &msg); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to receive tag 67 packet " + "from the user space daemon\n"); + rc = -EIO; + goto out; + } + rc = parse_tag_67_packet(key_rec, msg); + if (rc) + ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); + kfree(msg); +out: + if (netlink_payload) + kfree(netlink_payload); + return rc; +} +/** + * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet + * @dest: Buffer into which to write the packet + * @max: Maximum number of bytes that can be writtn + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_1_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + size_t i; + size_t encrypted_session_key_valid = 0; + size_t key_rec_size; + size_t packet_size_length; + int rc = 0; + + (*packet_size) = 0; + ecryptfs_from_hex(key_rec->sig, auth_tok->token.private_key.signature, + ECRYPTFS_SIG_SIZE); + encrypted_session_key_valid = 0; + for (i = 0; i < crypt_stat->key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + memcpy(key_rec->enc_key, + auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + goto encrypted_session_key_set; + } + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + auth_tok->token.private_key.key_size; + rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failed to encrypt session key " + "via a pki"); + goto out; + } + if (ecryptfs_verbosity > 0) { + ecryptfs_printk(KERN_DEBUG, "Encrypted key:\n"); + ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); + } +encrypted_session_key_set: + /* Now we have a valid key_rec. Append it to the + * key_rec set. */ + key_rec_size = (sizeof(struct ecryptfs_key_record) + - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES + + (key_rec->enc_key_size)); + /* TODO: Include a packet size limit as a parameter to this + * function once we have multi-packet headers (for versions + * later than 0.1 */ + if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) { + ecryptfs_printk(KERN_ERR, "Keyset too large\n"); + rc = -EINVAL; + goto out; + } + /* ***** TAG 1 Packet Format ***** + * | version number | 1 byte | + * | key ID | 8 bytes | + * | public key algorithm | 1 byte | + * | encrypted session key | arbitrary | + */ + if ((0x02 + ECRYPTFS_SIG_SIZE + key_rec->enc_key_size) >= max) { + ecryptfs_printk(KERN_ERR, + "Authentication token is too large\n"); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 1 */ + rc = write_packet_length(&dest[(*packet_size)], + (0x02 + ECRYPTFS_SIG_SIZE + + key_rec->enc_key_size), + &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x03; /* version 3 */ + memcpy(&dest[(*packet_size)], key_rec->sig, ECRYPTFS_SIG_SIZE); + (*packet_size) += ECRYPTFS_SIG_SIZE; + dest[(*packet_size)++] = RFC2440_CIPHER_RSA; + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; +out: + if (rc) + (*packet_size) = 0; + return rc; +} /** * write_tag_11_packet @@ -758,8 +1394,8 @@ static int write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length, size_t *packet_length) { - int rc = 0; size_t packet_size_length; + int rc = 0; (*packet_length) = 0; if ((13 + contents_length) > max) { @@ -817,7 +1453,6 @@ write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_key_record *key_rec, size_t *packet_size) { size_t i; - size_t signature_is_valid = 0; size_t encrypted_session_key_valid = 0; char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; struct scatterlist dest_sg[2]; @@ -833,19 +1468,14 @@ write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, int rc = 0; (*packet_size) = 0; - /* Check for a valid signature on the auth_tok */ - for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++) - signature_is_valid |= auth_tok->token.password.signature[i]; - if (!signature_is_valid) - BUG(); - ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature, + ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, ECRYPTFS_SIG_SIZE); encrypted_session_key_valid = 0; for (i = 0; i < crypt_stat->key_size; i++) encrypted_session_key_valid |= auth_tok->session_key.encrypted_key[i]; if (encrypted_session_key_valid) { - memcpy((*key_rec).enc_key, + memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, auth_tok->session_key.encrypted_key_size); goto encrypted_session_key_set; @@ -858,10 +1488,10 @@ write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, memset((crypt_stat->key + 24), 0, 8); auth_tok->session_key.encrypted_key_size = 32; } - (*key_rec).enc_key_size = + key_rec->enc_key_size = auth_tok->session_key.encrypted_key_size; - if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags, - ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) { + if (auth_tok->token.password.flags & + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET) { ecryptfs_printk(KERN_DEBUG, "Using previously generated " "session key encryption key of size [%d]\n", auth_tok->token.password. @@ -879,15 +1509,15 @@ write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, ecryptfs_dump_hex(session_key_encryption_key, 16); } rc = virt_to_scatterlist(crypt_stat->key, - (*key_rec).enc_key_size, src_sg, 2); + key_rec->enc_key_size, src_sg, 2); if (!rc) { ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat session key\n"); rc = -ENOMEM; goto out; } - rc = virt_to_scatterlist((*key_rec).enc_key, - (*key_rec).enc_key_size, dest_sg, 2); + rc = virt_to_scatterlist(key_rec->enc_key, + key_rec->enc_key_size, dest_sg, 2); if (!rc) { ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat encrypted session key\n"); @@ -943,14 +1573,14 @@ write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, mutex_unlock(tfm_mutex); ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); if (ecryptfs_verbosity > 0) - ecryptfs_dump_hex((*key_rec).enc_key, - (*key_rec).enc_key_size); + ecryptfs_dump_hex(key_rec->enc_key, + key_rec->enc_key_size); encrypted_session_key_set: /* Now we have a valid key_rec. Append it to the * key_rec set. */ key_rec_size = (sizeof(struct ecryptfs_key_record) - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES - + ((*key_rec).enc_key_size)); + + (key_rec->enc_key_size)); /* TODO: Include a packet size limit as a parameter to this * function once we have multi-packet headers (for versions * later than 0.1 */ @@ -962,7 +1592,7 @@ encrypted_session_key_set: /* TODO: Packet size limit */ /* We have 5 bytes of surrounding packet data */ if ((0x05 + ECRYPTFS_SALT_SIZE - + (*key_rec).enc_key_size) >= max) { + + key_rec->enc_key_size) >= max) { ecryptfs_printk(KERN_ERR, "Authentication token is too " "large\n"); rc = -EINVAL; @@ -974,7 +1604,7 @@ encrypted_session_key_set: /* ver+cipher+s2k+hash+salt+iter+enc_key */ rc = write_packet_length(&dest[(*packet_size)], (0x05 + ECRYPTFS_SALT_SIZE - + (*key_rec).enc_key_size), + + key_rec->enc_key_size), &packet_size_length); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet " @@ -997,9 +1627,9 @@ encrypted_session_key_set: ECRYPTFS_SALT_SIZE); (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */ dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */ - memcpy(&dest[(*packet_size)], (*key_rec).enc_key, - (*key_rec).enc_key_size); - (*packet_size) += (*key_rec).enc_key_size; + memcpy(&dest[(*packet_size)], key_rec->enc_key, + key_rec->enc_key_size); + (*packet_size) += key_rec->enc_key_size; out: if (desc.tfm && !tfm_mutex) crypto_free_blkcipher(desc.tfm); @@ -1008,6 +1638,8 @@ out: return rc; } +struct kmem_cache *ecryptfs_key_record_cache; + /** * ecryptfs_generate_key_packet_set * @dest: Virtual address from which to write the key record set @@ -1029,52 +1661,60 @@ ecryptfs_generate_key_packet_set(char *dest_base, struct dentry *ecryptfs_dentry, size_t *len, size_t max) { - int rc = 0; struct ecryptfs_auth_tok *auth_tok; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; size_t written; - struct ecryptfs_key_record key_rec; + struct ecryptfs_key_record *key_rec; + int rc = 0; (*len) = 0; + key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL); + if (!key_rec) { + rc = -ENOMEM; + goto out; + } if (mount_crypt_stat->global_auth_tok) { auth_tok = mount_crypt_stat->global_auth_tok; if (auth_tok->token_type == ECRYPTFS_PASSWORD) { rc = write_tag_3_packet((dest_base + (*len)), max, auth_tok, - crypt_stat, &key_rec, + crypt_stat, key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 3 packet\n"); - goto out; + goto out_free; } (*len) += written; /* Write auth tok signature packet */ rc = write_tag_11_packet( (dest_base + (*len)), (max - (*len)), - key_rec.sig, ECRYPTFS_SIG_SIZE, &written); + key_rec->sig, ECRYPTFS_SIG_SIZE, &written); if (rc) { ecryptfs_printk(KERN_ERR, "Error writing " "auth tok signature packet\n"); - goto out; + goto out_free; + } + (*len) += written; + } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { + rc = write_tag_1_packet(dest_base + (*len), + max, auth_tok, + crypt_stat,mount_crypt_stat, + key_rec, &written); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 1 packet\n"); + goto out_free; } (*len) += written; } else { ecryptfs_printk(KERN_WARNING, "Unsupported " "authentication token type\n"); rc = -EINVAL; - goto out; - } - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error writing " - "authentication token packet with sig " - "= [%s]\n", - mount_crypt_stat->global_auth_tok_sig); - rc = -EIO; - goto out; + goto out_free; } } else BUG(); @@ -1084,6 +1724,9 @@ ecryptfs_generate_key_packet_set(char *dest_base, ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); rc = -EIO; } + +out_free: + kmem_cache_free(ecryptfs_key_record_cache, key_rec); out: if (rc) (*len) = 0; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d0541ae8fab..80044d196fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -3,9 +3,10 @@ * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> + * Tyler Hicks <tyhicks@ou.edu> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -48,6 +49,43 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); +/** + * Module parameter that defines the number of netlink message buffer + * elements + */ +unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; + +module_param(ecryptfs_message_buf_len, uint, 0); +MODULE_PARM_DESC(ecryptfs_message_buf_len, + "Number of message buffer elements"); + +/** + * Module parameter that defines the maximum guaranteed amount of time to wait + * for a response through netlink. The actual sleep time will be, more than + * likely, a small amount greater than this specified value, but only less if + * the netlink message successfully arrives. + */ +signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; + +module_param(ecryptfs_message_wait_timeout, long, 0); +MODULE_PARM_DESC(ecryptfs_message_wait_timeout, + "Maximum number of seconds that an operation will " + "sleep while waiting for a message response from " + "userspace"); + +/** + * Module parameter that is an estimate of the maximum number of users + * that will be concurrently using eCryptfs. Set this to the right + * value to balance performance and memory use. + */ +unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; + +module_param(ecryptfs_number_of_users, uint, 0); +MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " + "concurrent users of eCryptfs"); + +unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT; + void __ecryptfs_printk(const char *fmt, ...) { va_list args; @@ -124,7 +162,8 @@ out: enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug, ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, - ecryptfs_opt_passthrough, ecryptfs_opt_err }; + ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, + ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; static match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -135,6 +174,8 @@ static match_table_t tokens = { {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, + {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, + {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, {ecryptfs_opt_err, NULL} }; @@ -275,6 +316,16 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) mount_crypt_stat->flags |= ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; break; + case ecryptfs_opt_xattr_metadata: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + break; + case ecryptfs_opt_encrypted_view: + mount_crypt_stat->flags |= + ECRYPTFS_XATTR_METADATA_ENABLED; + mount_crypt_stat->flags |= + ECRYPTFS_ENCRYPTED_VIEW_ENABLED; + break; case ecryptfs_opt_err: default: ecryptfs_printk(KERN_WARNING, @@ -347,9 +398,10 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) rc = -EINVAL; goto out; } - if (auth_tok->token_type != ECRYPTFS_PASSWORD) { + if (auth_tok->token_type != ECRYPTFS_PASSWORD + && auth_tok->token_type != ECRYPTFS_PRIVATE_KEY) { ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure " - "returned from key\n"); + "returned from key query\n"); rc = -EINVAL; goto out; } @@ -378,15 +430,13 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) /* Released in ecryptfs_put_super() */ ecryptfs_set_superblock_private(sb, - kmem_cache_alloc(ecryptfs_sb_info_cache, + kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL)); if (!ecryptfs_superblock_to_private(sb)) { ecryptfs_printk(KERN_WARNING, "Out of memory\n"); rc = -ENOMEM; goto out; } - memset(ecryptfs_superblock_to_private(sb), 0, - sizeof(struct ecryptfs_sb_info)); sb->s_op = &ecryptfs_sops; /* Released through deactivate_super(sb) from get_sb_nodev */ sb->s_root = d_alloc(NULL, &(const struct qstr) { @@ -402,7 +452,7 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) /* Released in d_release when dput(sb->s_root) is called */ /* through deactivate_super(sb) from get_sb_nodev() */ ecryptfs_set_dentry_private(sb->s_root, - kmem_cache_alloc(ecryptfs_dentry_info_cache, + kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); if (!ecryptfs_dentry_to_private(sb->s_root)) { ecryptfs_printk(KERN_ERR, @@ -410,8 +460,6 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) rc = -ENOMEM; goto out; } - memset(ecryptfs_dentry_to_private(sb->s_root), 0, - sizeof(struct ecryptfs_dentry_info)); rc = 0; out: /* Should be able to rely on deactivate_super called from @@ -594,10 +642,20 @@ static struct ecryptfs_cache_info { .size = PAGE_CACHE_SIZE, }, { + .cache = &ecryptfs_xattr_cache, + .name = "ecryptfs_xattr_cache", + .size = PAGE_CACHE_SIZE, + }, + { .cache = &ecryptfs_lower_page_cache, .name = "ecryptfs_lower_page_cache", .size = PAGE_CACHE_SIZE, }, + { + .cache = &ecryptfs_key_record_cache, + .name = "ecryptfs_key_record_cache", + .size = sizeof(struct ecryptfs_key_record), + }, }; static void ecryptfs_free_kmem_caches(void) @@ -699,7 +757,8 @@ static struct ecryptfs_version_str_map_elem { {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"}, {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"}, {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"}, - {ECRYPTFS_VERSIONING_POLICY, "policy"} + {ECRYPTFS_VERSIONING_POLICY, "policy"}, + {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"} }; static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff) @@ -798,6 +857,11 @@ static int __init ecryptfs_init(void) ecryptfs_free_kmem_caches(); goto out; } + rc = ecryptfs_init_messaging(ecryptfs_transport); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failure occured while attempting to " + "initialize the eCryptfs netlink socket\n"); + } out: return rc; } @@ -809,6 +873,7 @@ static void __exit ecryptfs_exit(void) sysfs_remove_file(&ecryptfs_subsys.kset.kobj, &sysfs_attr_version_str.attr); subsystem_unregister(&ecryptfs_subsys); + ecryptfs_release_messaging(ecryptfs_transport); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); } diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c new file mode 100644 index 00000000000..3baf253be95 --- /dev/null +++ b/fs/ecryptfs/messaging.c @@ -0,0 +1,516 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * Tyler Hicks <tyhicks@ou.edu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include "ecryptfs_kernel.h" + +static LIST_HEAD(ecryptfs_msg_ctx_free_list); +static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); +static struct mutex ecryptfs_msg_ctx_lists_mux; + +static struct hlist_head *ecryptfs_daemon_id_hash; +static struct mutex ecryptfs_daemon_id_hash_mux; +static int ecryptfs_hash_buckets; +#define ecryptfs_uid_hash(uid) \ + hash_long((unsigned long)uid, ecryptfs_hash_buckets) + +static unsigned int ecryptfs_msg_counter; +static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; + +/** + * ecryptfs_acquire_free_msg_ctx + * @msg_ctx: The context that was acquired from the free list + * + * Acquires a context element from the free list and locks the mutex + * on the context. Returns zero on success; non-zero on error or upon + * failure to acquire a free context element. Be sure to lock the + * list mutex before calling. + */ +static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) +{ + struct list_head *p; + int rc; + + if (list_empty(&ecryptfs_msg_ctx_free_list)) { + ecryptfs_printk(KERN_WARNING, "The eCryptfs free " + "context list is empty. It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", ecryptfs_message_buf_len); + rc = -ENOMEM; + goto out; + } + list_for_each(p, &ecryptfs_msg_ctx_free_list) { + *msg_ctx = list_entry(p, struct ecryptfs_msg_ctx, node); + if (mutex_trylock(&(*msg_ctx)->mux)) { + (*msg_ctx)->task = current; + rc = 0; + goto out; + } + } + rc = -ENOMEM; +out: + return rc; +} + +/** + * ecryptfs_msg_ctx_free_to_alloc + * @msg_ctx: The context to move from the free list to the alloc list + * + * Be sure to lock the list mutex and the context mutex before + * calling. + */ +static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&msg_ctx->node, &ecryptfs_msg_ctx_alloc_list); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_PENDING; + msg_ctx->counter = ++ecryptfs_msg_counter; +} + +/** + * ecryptfs_msg_ctx_alloc_to_free + * @msg_ctx: The context to move from the alloc list to the free list + * + * Be sure to lock the list mutex and the context mutex before + * calling. + */ +static void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +{ + list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); + if (msg_ctx->msg) + kfree(msg_ctx->msg); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; +} + +/** + * ecryptfs_find_daemon_id + * @uid: The user id which maps to the desired daemon id + * @id: If return value is zero, points to the desired daemon id + * pointer + * + * Search the hash list for the given user id. Returns zero if the + * user id exists in the list; non-zero otherwise. The daemon id hash + * mutex should be held before calling this function. + */ +static int ecryptfs_find_daemon_id(uid_t uid, struct ecryptfs_daemon_id **id) +{ + struct hlist_node *elem; + int rc; + + hlist_for_each_entry(*id, elem, + &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)], + id_chain) { + if ((*id)->uid == uid) { + rc = 0; + goto out; + } + } + rc = -EINVAL; +out: + return rc; +} + +static int ecryptfs_send_raw_message(unsigned int transport, u16 msg_type, + pid_t pid) +{ + int rc; + + switch(transport) { + case ECRYPTFS_TRANSPORT_NETLINK: + rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, pid); + break; + case ECRYPTFS_TRANSPORT_CONNECTOR: + case ECRYPTFS_TRANSPORT_RELAYFS: + default: + rc = -ENOSYS; + } + return rc; +} + +/** + * ecryptfs_process_helo + * @transport: The underlying transport (netlink, etc.) + * @uid: The user ID owner of the message + * @pid: The process ID for the userspace program that sent the + * message + * + * Adds the uid and pid values to the daemon id hash. If a uid + * already has a daemon pid registered, the daemon will be + * unregistered before the new daemon id is put into the hash list. + * Returns zero after adding a new daemon id to the hash list; + * non-zero otherwise. + */ +int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid) +{ + struct ecryptfs_daemon_id *new_id; + struct ecryptfs_daemon_id *old_id; + int rc; + + mutex_lock(&ecryptfs_daemon_id_hash_mux); + new_id = kmalloc(sizeof(*new_id), GFP_KERNEL); + if (!new_id) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable " + "to register daemon [%d] for user [%d]\n", + pid, uid); + goto unlock; + } + if (!ecryptfs_find_daemon_id(uid, &old_id)) { + printk(KERN_WARNING "Received request from user [%d] " + "to register daemon [%d]; unregistering daemon " + "[%d]\n", uid, pid, old_id->pid); + hlist_del(&old_id->id_chain); + rc = ecryptfs_send_raw_message(transport, ECRYPTFS_NLMSG_QUIT, + old_id->pid); + if (rc) + printk(KERN_WARNING "Failed to send QUIT " + "message to daemon [%d]; rc = [%d]\n", + old_id->pid, rc); + kfree(old_id); + } + new_id->uid = uid; + new_id->pid = pid; + hlist_add_head(&new_id->id_chain, + &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)]); + rc = 0; +unlock: + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + return rc; +} + +/** + * ecryptfs_process_quit + * @uid: The user ID owner of the message + * @pid: The process ID for the userspace program that sent the + * message + * + * Deletes the corresponding daemon id for the given uid and pid, if + * it is the registered that is requesting the deletion. Returns zero + * after deleting the desired daemon id; non-zero otherwise. + */ +int ecryptfs_process_quit(uid_t uid, pid_t pid) +{ + struct ecryptfs_daemon_id *id; + int rc; + + mutex_lock(&ecryptfs_daemon_id_hash_mux); + if (ecryptfs_find_daemon_id(uid, &id)) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Received request from user [%d] to " + "unregister unrecognized daemon [%d]\n", uid, + pid); + goto unlock; + } + if (id->pid != pid) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Received request from user [%d] " + "with pid [%d] to unregister daemon [%d]\n", + uid, pid, id->pid); + goto unlock; + } + hlist_del(&id->id_chain); + kfree(id); + rc = 0; +unlock: + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + return rc; +} + +/** + * ecryptfs_process_reponse + * @msg: The ecryptfs message received; the caller should sanity check + * msg->data_len + * @pid: The process ID of the userspace application that sent the + * message + * @seq: The sequence number of the message + * + * Processes a response message after sending a operation request to + * userspace. Returns zero upon delivery to desired context element; + * non-zero upon delivery failure or error. + */ +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, + pid_t pid, u32 seq) +{ + struct ecryptfs_daemon_id *id; + struct ecryptfs_msg_ctx *msg_ctx; + int msg_size; + int rc; + + if (msg->index >= ecryptfs_message_buf_len) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", msg->index, + (ecryptfs_message_buf_len - 1)); + goto out; + } + msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; + mutex_lock(&msg_ctx->mux); + if (ecryptfs_find_daemon_id(msg_ctx->task->euid, &id)) { + rc = -EBADMSG; + ecryptfs_printk(KERN_WARNING, "User [%d] received a " + "message response from process [%d] but does " + "not have a registered daemon\n", + msg_ctx->task->euid, pid); + goto wake_up; + } + if (msg_ctx->task->euid != uid) { + rc = -EBADMSG; + ecryptfs_printk(KERN_WARNING, "Received message from user " + "[%d]; expected message from user [%d]\n", + uid, msg_ctx->task->euid); + goto unlock; + } + if (id->pid != pid) { + rc = -EBADMSG; + ecryptfs_printk(KERN_ERR, "User [%d] received a " + "message response from an unrecognized " + "process [%d]\n", msg_ctx->task->euid, pid); + goto unlock; + } + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Desired context element is not " + "pending a response\n"); + goto unlock; + } else if (msg_ctx->counter != seq) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Invalid message sequence; " + "expected [%d]; received [%d]\n", + msg_ctx->counter, seq); + goto unlock; + } + msg_size = sizeof(*msg) + msg->data_len; + msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + goto unlock; + } + memcpy(msg_ctx->msg, msg, msg_size); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; + rc = 0; +wake_up: + wake_up_process(msg_ctx->task); +unlock: + mutex_unlock(&msg_ctx->mux); +out: + return rc; +} + +/** + * ecryptfs_send_message + * @transport: The transport over which to send the message (i.e., + * netlink) + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + */ +int ecryptfs_send_message(unsigned int transport, char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + struct ecryptfs_daemon_id *id; + int rc; + + mutex_lock(&ecryptfs_daemon_id_hash_mux); + if (ecryptfs_find_daemon_id(current->euid, &id)) { + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = -ENOTCONN; + ecryptfs_printk(KERN_ERR, "User [%d] does not have a daemon " + "registered\n", current->euid); + goto out; + } + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); + if (rc) { + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + ecryptfs_printk(KERN_WARNING, "Could not claim a free " + "context element\n"); + goto out; + } + ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); + mutex_unlock(&(*msg_ctx)->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + switch (transport) { + case ECRYPTFS_TRANSPORT_NETLINK: + rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, + ECRYPTFS_NLMSG_REQUEST, 0, id->pid); + break; + case ECRYPTFS_TRANSPORT_CONNECTOR: + case ECRYPTFS_TRANSPORT_RELAYFS: + default: + rc = -ENOSYS; + } + if (rc) { + printk(KERN_ERR "Error attempting to send message to userspace " + "daemon; rc = [%d]\n", rc); + } +out: + return rc; +} + +/** + * ecryptfs_wait_for_response + * @msg_ctx: The context that was assigned when sending a message + * @msg: The incoming message from userspace; not set if rc != 0 + * + * Sleeps until awaken by ecryptfs_receive_message or until the amount + * of time exceeds ecryptfs_message_wait_timeout. If zero is + * returned, msg will point to a valid message from userspace; a + * non-zero value is returned upon failure to receive a message or an + * error occurs. + */ +int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **msg) +{ + signed long timeout = ecryptfs_message_wait_timeout * HZ; + int rc = 0; + +sleep: + timeout = schedule_timeout_interruptible(timeout); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_DONE) { + if (timeout) { + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + goto sleep; + } + rc = -ENOMSG; + } else { + *msg = msg_ctx->msg; + msg_ctx->msg = NULL; + } + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + mutex_unlock(&msg_ctx->mux); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + return rc; +} + +int ecryptfs_init_messaging(unsigned int transport) +{ + int i; + int rc = 0; + + if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { + ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; + ecryptfs_printk(KERN_WARNING, "Specified number of users is " + "too large, defaulting to [%d] users\n", + ecryptfs_number_of_users); + } + mutex_init(&ecryptfs_daemon_id_hash_mux); + mutex_lock(&ecryptfs_daemon_id_hash_mux); + ecryptfs_hash_buckets = 0; + while (ecryptfs_number_of_users >> ++ecryptfs_hash_buckets); + ecryptfs_daemon_id_hash = kmalloc(sizeof(struct hlist_head) + * ecryptfs_hash_buckets, GFP_KERNEL); + if (!ecryptfs_daemon_id_hash) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + goto out; + } + for (i = 0; i < ecryptfs_hash_buckets; i++) + INIT_HLIST_HEAD(&ecryptfs_daemon_id_hash[i]); + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + + ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) + * ecryptfs_message_buf_len), GFP_KERNEL); + if (!ecryptfs_msg_ctx_arr) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + goto out; + } + mutex_init(&ecryptfs_msg_ctx_lists_mux); + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + ecryptfs_msg_counter = 0; + for (i = 0; i < ecryptfs_message_buf_len; i++) { + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + mutex_init(&ecryptfs_msg_ctx_arr[i].mux); + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + ecryptfs_msg_ctx_arr[i].index = i; + ecryptfs_msg_ctx_arr[i].state = ECRYPTFS_MSG_CTX_STATE_FREE; + ecryptfs_msg_ctx_arr[i].counter = 0; + ecryptfs_msg_ctx_arr[i].task = NULL; + ecryptfs_msg_ctx_arr[i].msg = NULL; + list_add_tail(&ecryptfs_msg_ctx_arr[i].node, + &ecryptfs_msg_ctx_free_list); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + switch(transport) { + case ECRYPTFS_TRANSPORT_NETLINK: + rc = ecryptfs_init_netlink(); + if (rc) + ecryptfs_release_messaging(transport); + break; + case ECRYPTFS_TRANSPORT_CONNECTOR: + case ECRYPTFS_TRANSPORT_RELAYFS: + default: + rc = -ENOSYS; + } +out: + return rc; +} + +void ecryptfs_release_messaging(unsigned int transport) +{ + if (ecryptfs_msg_ctx_arr) { + int i; + + mutex_lock(&ecryptfs_msg_ctx_lists_mux); + for (i = 0; i < ecryptfs_message_buf_len; i++) { + mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); + if (ecryptfs_msg_ctx_arr[i].msg) + kfree(ecryptfs_msg_ctx_arr[i].msg); + mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); + } + kfree(ecryptfs_msg_ctx_arr); + mutex_unlock(&ecryptfs_msg_ctx_lists_mux); + } + if (ecryptfs_daemon_id_hash) { + struct hlist_node *elem; + struct ecryptfs_daemon_id *id; + int i; + + mutex_lock(&ecryptfs_daemon_id_hash_mux); + for (i = 0; i < ecryptfs_hash_buckets; i++) { + hlist_for_each_entry(id, elem, + &ecryptfs_daemon_id_hash[i], + id_chain) { + hlist_del(elem); + kfree(id); + } + } + kfree(ecryptfs_daemon_id_hash); + mutex_unlock(&ecryptfs_daemon_id_hash_mux); + } + switch(transport) { + case ECRYPTFS_TRANSPORT_NETLINK: + ecryptfs_release_netlink(); + break; + case ECRYPTFS_TRANSPORT_CONNECTOR: + case ECRYPTFS_TRANSPORT_RELAYFS: + default: + break; + } + return; +} diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 06843d24f23..3a6f65c3f14 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -6,7 +6,7 @@ * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * * This program is free software; you can redistribute it and/or @@ -234,22 +234,13 @@ int ecryptfs_do_readpage(struct file *file, struct page *page, goto out; } wait_on_page_locked(lower_page); - page_data = (char *)kmap(page); - if (!page_data) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Error mapping page\n"); - goto out; - } - lower_page_data = (char *)kmap(lower_page); - if (!lower_page_data) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Error mapping page\n"); - kunmap(page); - goto out; - } + page_data = kmap_atomic(page, KM_USER0); + lower_page_data = kmap_atomic(lower_page, KM_USER1); memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE); - kunmap(lower_page); - kunmap(page); + kunmap_atomic(lower_page_data, KM_USER1); + flush_dcache_page(lower_page); + kunmap_atomic(page_data, KM_USER0); + flush_dcache_page(page); rc = 0; out: if (likely(lower_page)) @@ -260,6 +251,33 @@ out: ClearPageUptodate(page); return rc; } +/** + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + */ +static void set_header_info(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + size_t written; + int save_num_header_extents_at_front = + crypt_stat->num_header_extents_at_front; + + crypt_stat->num_header_extents_at_front = 1; + ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); + crypt_stat->num_header_extents_at_front = + save_num_header_extents_at_front; +} /** * ecryptfs_readpage @@ -279,8 +297,8 @@ static int ecryptfs_readpage(struct file *file, struct page *page) crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode) ->crypt_stat; if (!crypt_stat - || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED) - || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) { + || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED) + || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { ecryptfs_printk(KERN_DEBUG, "Passing through unencrypted page\n"); rc = ecryptfs_do_readpage(file, page, page->index); @@ -289,10 +307,51 @@ static int ecryptfs_readpage(struct file *file, struct page *page) "[%d]\n", rc); goto out; } + } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + int num_pages_in_header_region = + (crypt_stat->header_extent_size + / PAGE_CACHE_SIZE); + + if (page->index < num_pages_in_header_region) { + char *page_virt; + + page_virt = kmap_atomic(page, KM_USER0); + memset(page_virt, 0, PAGE_CACHE_SIZE); + if (page->index == 0) { + rc = ecryptfs_read_xattr_region( + page_virt, file->f_path.dentry); + set_header_info(page_virt, crypt_stat); + } + kunmap_atomic(page_virt, KM_USER0); + flush_dcache_page(page); + if (rc) { + printk(KERN_ERR "Error reading xattr " + "region\n"); + goto out; + } + } else { + rc = ecryptfs_do_readpage( + file, page, + (page->index + - num_pages_in_header_region)); + if (rc) { + printk(KERN_ERR "Error reading page; " + "rc = [%d]\n", rc); + goto out; + } + } + } else { + rc = ecryptfs_do_readpage(file, page, page->index); + if (rc) { + printk(KERN_ERR "Error reading page; rc = " + "[%d]\n", rc); + goto out; + } + } } else { rc = ecryptfs_decrypt_page(file, page); if (rc) { - ecryptfs_printk(KERN_ERR, "Error decrypting page; " "rc = [%d]\n", rc); goto out; @@ -308,30 +367,27 @@ out: return rc; } +/** + * Called with lower inode mutex held. + */ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) { struct inode *inode = page->mapping->host; int end_byte_in_page; - int rc = 0; char *page_virt; - if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) { - end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; - if (to > end_byte_in_page) - end_byte_in_page = to; - page_virt = kmap(page); - if (!page_virt) { - rc = -ENOMEM; - ecryptfs_printk(KERN_WARNING, - "Could not map page\n"); - goto out; - } - memset((page_virt + end_byte_in_page), 0, - (PAGE_CACHE_SIZE - end_byte_in_page)); - kunmap(page); - } + if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + goto out; + end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + if (to > end_byte_in_page) + end_byte_in_page = to; + page_virt = kmap_atomic(page, KM_USER0); + memset((page_virt + end_byte_in_page), 0, + (PAGE_CACHE_SIZE - end_byte_in_page)); + kunmap_atomic(page_virt, KM_USER0); + flush_dcache_page(page); out: - return rc; + return 0; } static int ecryptfs_prepare_write(struct file *file, struct page *page, @@ -339,7 +395,6 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, { int rc = 0; - kmap(page); if (from == 0 && to == PAGE_CACHE_SIZE) goto out; /* If we are writing a full page, it will be up to date. */ @@ -349,30 +404,6 @@ out: return rc; } -int ecryptfs_grab_and_map_lower_page(struct page **lower_page, - char **lower_virt, - struct inode *lower_inode, - unsigned long lower_page_index) -{ - int rc = 0; - - (*lower_page) = grab_cache_page(lower_inode->i_mapping, - lower_page_index); - if (!(*lower_page)) { - ecryptfs_printk(KERN_ERR, "grab_cache_page for " - "lower_page_index = [0x%.16x] failed\n", - lower_page_index); - rc = -EINVAL; - goto out; - } - if (lower_virt) - (*lower_virt) = kmap((*lower_page)); - else - kmap((*lower_page)); -out: - return rc; -} - int ecryptfs_writepage_and_release_lower_page(struct page *lower_page, struct inode *lower_inode, struct writeback_control *wbc) @@ -391,11 +422,8 @@ out: return rc; } -static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page) +static void ecryptfs_release_lower_page(struct page *lower_page) { - kunmap(lower_page); - ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = " - "[0x%.16x]\n", lower_page->index); unlock_page(lower_page); page_cache_release(lower_page); } @@ -407,10 +435,9 @@ static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page) * * Returns zero on success; non-zero on error. */ -int -ecryptfs_write_inode_size_to_header(struct file *lower_file, - struct inode *lower_inode, - struct inode *inode) +static int ecryptfs_write_inode_size_to_header(struct file *lower_file, + struct inode *lower_inode, + struct inode *inode) { int rc = 0; struct page *header_page; @@ -418,11 +445,11 @@ ecryptfs_write_inode_size_to_header(struct file *lower_file, const struct address_space_operations *lower_a_ops; u64 file_size; - rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt, - lower_inode, 0); - if (rc) { - ecryptfs_printk(KERN_ERR, "grab_cache_page for header page " - "failed\n"); + header_page = grab_cache_page(lower_inode->i_mapping, 0); + if (!header_page) { + ecryptfs_printk(KERN_ERR, "grab_cache_page for " + "lower_page_index 0 failed\n"); + rc = -EINVAL; goto out; } lower_a_ops = lower_inode->i_mapping->a_ops; @@ -430,18 +457,95 @@ ecryptfs_write_inode_size_to_header(struct file *lower_file, file_size = (u64)i_size_read(inode); ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size); file_size = cpu_to_be64(file_size); + header_virt = kmap_atomic(header_page, KM_USER0); memcpy(header_virt, &file_size, sizeof(u64)); + kunmap_atomic(header_virt, KM_USER0); + flush_dcache_page(header_page); rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8); if (rc < 0) ecryptfs_printk(KERN_ERR, "Error commiting header page " "write\n"); - ecryptfs_unmap_and_release_lower_page(header_page); + ecryptfs_release_lower_page(header_page); lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); out: return rc; } +static int ecryptfs_write_inode_size_to_xattr(struct inode *lower_inode, + struct inode *inode, + struct dentry *ecryptfs_dentry, + int lower_i_mutex_held) +{ + ssize_t size; + void *xattr_virt; + struct dentry *lower_dentry; + u64 file_size; + int rc; + + xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); + if (!xattr_virt) { + printk(KERN_ERR "Out of memory whilst attempting to write " + "inode size to xattr\n"); + rc = -ENOMEM; + goto out; + } + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + if (!lower_dentry->d_inode->i_op->getxattr) { + printk(KERN_WARNING + "No support for setting xattr in lower filesystem\n"); + rc = -ENOSYS; + kmem_cache_free(ecryptfs_xattr_cache, xattr_virt); + goto out; + } + if (!lower_i_mutex_held) + mutex_lock(&lower_dentry->d_inode->i_mutex); + size = lower_dentry->d_inode->i_op->getxattr(lower_dentry, + ECRYPTFS_XATTR_NAME, + xattr_virt, + PAGE_CACHE_SIZE); + if (!lower_i_mutex_held) + mutex_unlock(&lower_dentry->d_inode->i_mutex); + if (size < 0) + size = 8; + file_size = (u64)i_size_read(inode); + file_size = cpu_to_be64(file_size); + memcpy(xattr_virt, &file_size, sizeof(u64)); + if (!lower_i_mutex_held) + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, + ECRYPTFS_XATTR_NAME, + xattr_virt, size, 0); + if (!lower_i_mutex_held) + mutex_unlock(&lower_dentry->d_inode->i_mutex); + if (rc) + printk(KERN_ERR "Error whilst attempting to write inode size " + "to lower file xattr; rc = [%d]\n", rc); + kmem_cache_free(ecryptfs_xattr_cache, xattr_virt); +out: + return rc; +} + +int +ecryptfs_write_inode_size_to_metadata(struct file *lower_file, + struct inode *lower_inode, + struct inode *inode, + struct dentry *ecryptfs_dentry, + int lower_i_mutex_held) +{ + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return ecryptfs_write_inode_size_to_xattr(lower_inode, inode, + ecryptfs_dentry, + lower_i_mutex_held); + else + return ecryptfs_write_inode_size_to_header(lower_file, + lower_inode, + inode); +} + int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, struct file *lower_file, unsigned long lower_page_index, int byte_offset, @@ -449,10 +553,10 @@ int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, { int rc = 0; - rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode, - lower_page_index); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to grab and map " + *lower_page = grab_cache_page(lower_inode->i_mapping, lower_page_index); + if (!(*lower_page)) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Error attempting to grab " "lower page with index [0x%.16x]\n", lower_page_index); goto out; @@ -468,7 +572,7 @@ int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, } out: if (rc && (*lower_page)) { - ecryptfs_unmap_and_release_lower_page(*lower_page); + ecryptfs_release_lower_page(*lower_page); (*lower_page) = NULL; } return rc; @@ -493,7 +597,7 @@ ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode, "Error committing write; rc = [%d]\n", rc); } else rc = 0; - ecryptfs_unmap_and_release_lower_page(lower_page); + ecryptfs_release_lower_page(lower_page); return rc; } @@ -528,89 +632,7 @@ out: return rc; } -static int -process_new_file(struct ecryptfs_crypt_stat *crypt_stat, - struct file *file, struct inode *inode) -{ - struct page *header_page; - const struct address_space_operations *lower_a_ops; - struct inode *lower_inode; - struct file *lower_file; - char *header_virt; - int rc = 0; - int current_header_page = 0; - int header_pages; - int more_header_data_to_be_written = 1; - - lower_inode = ecryptfs_inode_to_lower(inode); - lower_file = ecryptfs_file_to_lower(file); - lower_a_ops = lower_inode->i_mapping->a_ops; - header_pages = ((crypt_stat->header_extent_size - * crypt_stat->num_header_extents_at_front) - / PAGE_CACHE_SIZE); - BUG_ON(header_pages < 1); - while (current_header_page < header_pages) { - rc = ecryptfs_grab_and_map_lower_page(&header_page, - &header_virt, - lower_inode, - current_header_page); - if (rc) { - ecryptfs_printk(KERN_ERR, "grab_cache_page for " - "header page [%d] failed; rc = [%d]\n", - current_header_page, rc); - goto out; - } - rc = lower_a_ops->prepare_write(lower_file, header_page, 0, - PAGE_CACHE_SIZE); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error preparing to write " - "header page out; rc = [%d]\n", rc); - goto out; - } - memset(header_virt, 0, PAGE_CACHE_SIZE); - if (more_header_data_to_be_written) { - rc = ecryptfs_write_headers_virt(header_virt, - crypt_stat, - file->f_dentry); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error " - "generating header; rc = " - "[%d]\n", rc); - rc = -EIO; - memset(header_virt, 0, PAGE_CACHE_SIZE); - ecryptfs_unmap_and_release_lower_page( - header_page); - goto out; - } - if (current_header_page == 0) - memset(header_virt, 0, 8); - more_header_data_to_be_written = 0; - } - rc = lower_a_ops->commit_write(lower_file, header_page, 0, - PAGE_CACHE_SIZE); - ecryptfs_unmap_and_release_lower_page(header_page); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, - "Error commiting header page write; " - "rc = [%d]\n", rc); - break; - } - current_header_page++; - } - if (rc >= 0) { - rc = 0; - ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = " - "[0x%.16x]\n", lower_inode->i_blocks); - i_size_write(inode, 0); - lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty_sync(inode); - } - ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in " - "crypt_stat at memory location [%p]\n", crypt_stat); - ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE); -out: - return rc; -} +struct kmem_cache *ecryptfs_xattr_cache; /** * ecryptfs_commit_write @@ -640,15 +662,10 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, mutex_lock(&lower_inode->i_mutex); crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode) ->crypt_stat; - if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) { + if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in " "crypt_stat at memory location [%p]\n", crypt_stat); - rc = process_new_file(crypt_stat, file, inode); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error processing new " - "file; rc = [%d]\n", rc); - goto out; - } + crypt_stat->flags &= ~(ECRYPTFS_NEW_FILE); } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" @@ -670,7 +687,6 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, "index [0x%.16x])\n", page->index); goto out; } - rc = 0; inode->i_blocks = lower_inode->i_blocks; pos = (page->index << PAGE_CACHE_SHIFT) + to; if (pos > i_size_read(inode)) { @@ -678,11 +694,15 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16x]\n", i_size_read(inode)); } - ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode); + rc = ecryptfs_write_inode_size_to_metadata(lower_file, lower_inode, + inode, file->f_dentry, + ECRYPTFS_LOWER_I_MUTEX_HELD); + if (rc) + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); out: - kunmap(page); /* mapped in prior call (prepare_write) */ if (rc < 0) ClearPageUptodate(page); else @@ -707,6 +727,7 @@ int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) { int rc = 0; struct page *tmp_page; + char *tmp_page_virt; tmp_page = ecryptfs_get1page(file, index); if (IS_ERR(tmp_page)) { @@ -715,28 +736,27 @@ int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) rc = PTR_ERR(tmp_page); goto out; } - kmap(tmp_page); rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros); if (rc) { ecryptfs_printk(KERN_ERR, "Error preparing to write zero's " "to remainder of page at index [0x%.16x]\n", index); - kunmap(tmp_page); page_cache_release(tmp_page); goto out; } - memset(((char *)page_address(tmp_page) + start), 0, num_zeros); + tmp_page_virt = kmap_atomic(tmp_page, KM_USER0); + memset(((char *)tmp_page_virt + start), 0, num_zeros); + kunmap_atomic(tmp_page_virt, KM_USER0); + flush_dcache_page(tmp_page); rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write zero's " "to remainder of page at index [0x%.16x]\n", index); - kunmap(tmp_page); page_cache_release(tmp_page); goto out; } rc = 0; - kunmap(tmp_page); page_cache_release(tmp_page); out: return rc; diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c new file mode 100644 index 00000000000..e3aa2253c85 --- /dev/null +++ b/fs/ecryptfs/netlink.c @@ -0,0 +1,255 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * Tyler Hicks <tyhicks@ou.edu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <net/sock.h> +#include <linux/hash.h> +#include <linux/random.h> +#include "ecryptfs_kernel.h" + +static struct sock *ecryptfs_nl_sock; + +/** + * ecryptfs_send_netlink + * @data: The data to include as the payload + * @data_len: The byte count of the data + * @msg_ctx: The netlink context that will be used to handle the + * response message + * @msg_type: The type of netlink message to send + * @msg_flags: The flags to include in the netlink header + * @daemon_pid: The process id of the daemon to send the message to + * + * Sends the data to the specified daemon pid and uses the netlink + * context element to store the data needed for validation upon + * receiving the response. The data and the netlink context can be + * null if just sending a netlink header is sufficient. Returns zero + * upon sending the message; non-zero upon error. + */ +int ecryptfs_send_netlink(char *data, int data_len, + struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, + u16 msg_flags, pid_t daemon_pid) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct ecryptfs_message *msg; + size_t payload_len; + int rc; + + payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0); + skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL); + if (!skb) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); + goto out; + } + nlh = NLMSG_PUT(skb, daemon_pid, msg_ctx ? msg_ctx->counter : 0, + msg_type, payload_len); + nlh->nlmsg_flags = msg_flags; + if (msg_ctx && payload_len) { + msg = (struct ecryptfs_message *)NLMSG_DATA(nlh); + msg->index = msg_ctx->index; + msg->data_len = data_len; + memcpy(msg->data, data, data_len); + } + rc = netlink_unicast(ecryptfs_nl_sock, skb, daemon_pid, 0); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " + "message; rc = [%d]\n", rc); + goto out; + } + rc = 0; + goto out; +nlmsg_failure: + rc = -EMSGSIZE; + kfree_skb(skb); +out: + return rc; +} + +/** + * ecryptfs_process_nl_reponse + * @skb: The socket buffer containing the netlink message of state + * RESPONSE + * + * Processes a response message after sending a operation request to + * userspace. Attempts to assign the msg to a netlink context element + * at the index specified in the msg. The sk_buff and nlmsghdr must + * be validated before this function. Returns zero upon delivery to + * desired context element; non-zero upon delivery failure or error. + */ +static int ecryptfs_process_nl_response(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data; + struct ecryptfs_message *msg = NLMSG_DATA(nlh); + int rc; + + if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Received netlink message with " + "incorrectly specified data length\n"); + goto out; + } + rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, + NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_process_nl_helo + * @skb: The socket buffer containing the nlmsghdr in HELO state + * + * Gets uid and pid of the skb and adds the values to the daemon id + * hash. Returns zero after adding a new daemon id to the hash list; + * non-zero otherwise. + */ +static int ecryptfs_process_nl_helo(struct sk_buff *skb) +{ + int rc; + + rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, + NETLINK_CREDS(skb)->uid, + NETLINK_CREDS(skb)->pid); + if (rc) + printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_process_nl_quit + * @skb: The socket buffer containing the nlmsghdr in QUIT state + * + * Gets uid and pid of the skb and deletes the corresponding daemon + * id, if it is the registered that is requesting the + * deletion. Returns zero after deleting the desired daemon id; + * non-zero otherwise. + */ +static int ecryptfs_process_nl_quit(struct sk_buff *skb) +{ + int rc; + + rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, + NETLINK_CREDS(skb)->pid); + if (rc) + printk(KERN_WARNING + "Error processing QUIT message; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_receive_nl_message + * + * Callback function called by netlink system when a message arrives. + * If the message looks to be valid, then an attempt is made to assign + * it to its desired netlink context element and wake up the process + * that is waiting for a response. + */ +static void ecryptfs_receive_nl_message(struct sock *sk, int len) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int rc = 0; /* skb_recv_datagram requires this */ + +receive: + skb = skb_recv_datagram(sk, 0, 0, &rc); + if (rc == -EINTR) + goto receive; + else if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error occurred while " + "receiving eCryptfs netlink message; " + "rc = [%d]\n", rc); + return; + } + nlh = (struct nlmsghdr *)skb->data; + if (!NLMSG_OK(nlh, skb->len)) { + ecryptfs_printk(KERN_ERR, "Received corrupt netlink " + "message\n"); + goto free; + } + switch (nlh->nlmsg_type) { + case ECRYPTFS_NLMSG_RESPONSE: + if (ecryptfs_process_nl_response(skb)) { + ecryptfs_printk(KERN_WARNING, "Failed to " + "deliver netlink response to " + "requesting operation\n"); + } + break; + case ECRYPTFS_NLMSG_HELO: + if (ecryptfs_process_nl_helo(skb)) { + ecryptfs_printk(KERN_WARNING, "Failed to " + "fulfill HELO request\n"); + } + break; + case ECRYPTFS_NLMSG_QUIT: + if (ecryptfs_process_nl_quit(skb)) { + ecryptfs_printk(KERN_WARNING, "Failed to " + "fulfill QUIT request\n"); + } + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping netlink " + "message of unrecognized type [%d]\n", + nlh->nlmsg_type); + break; + } +free: + kfree_skb(skb); +} + +/** + * ecryptfs_init_netlink + * + * Initializes the daemon id hash list, netlink context array, and + * necessary locks. Returns zero upon success; non-zero upon error. + */ +int ecryptfs_init_netlink(void) +{ + int rc; + + ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0, + ecryptfs_receive_nl_message, + THIS_MODULE); + if (!ecryptfs_nl_sock) { + rc = -EIO; + ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); + goto out; + } + ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT; + rc = 0; +out: + return rc; +} + +/** + * ecryptfs_release_netlink + * + * Frees all memory used by the netlink context array and releases the + * netlink socket. + */ +void ecryptfs_release_netlink(void) +{ + if (ecryptfs_nl_sock && ecryptfs_nl_sock->sk_socket) + sock_release(ecryptfs_nl_sock->sk_socket); + ecryptfs_nl_sock = NULL; +} diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index eaa5daaf106..7b3f0cc09a6 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -168,7 +168,7 @@ out: return rc; } -struct super_operations ecryptfs_sops = { +const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, .drop_inode = generic_delete_inode, diff --git a/fs/efs/dir.c b/fs/efs/dir.c index b46c488eefc..dfb5cb40021 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -15,7 +15,7 @@ const struct file_operations efs_dir_operations = { .readdir = efs_readdir, }; -struct inode_operations efs_dir_inode_operations = { +const struct inode_operations efs_dir_inode_operations = { .lookup = efs_lookup, }; diff --git a/fs/efs/super.c b/fs/efs/super.c index dfebf21289f..c2235e46edc 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -105,7 +105,7 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations efs_superblock_operations = { +static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .destroy_inode = efs_destroy_inode, .read_inode = efs_read_inode, diff --git a/fs/exec.c b/fs/exec.c index 11fe93f7363..7e36c6f6f53 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -405,12 +405,10 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!mpnt) return -ENOMEM; - memset(mpnt, 0, sizeof(*mpnt)); - down_write(&mm->mmap_sem); { mpnt->vm_mm = mm; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 0b02ba9642d..e89bfc8cf95 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -368,6 +368,14 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, } if (++n >= npages) n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + ext2_error(dir->i_sb, __FUNCTION__, + "dir %lu size %lld exceeds block count %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } } while (n != start); out: return NULL; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index c19ac153f56..e2a0ea50af1 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -158,7 +158,7 @@ extern void ext2_write_super (struct super_block *); extern const struct file_operations ext2_dir_operations; /* file.c */ -extern struct inode_operations ext2_file_inode_operations; +extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations; @@ -168,9 +168,9 @@ extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ -extern struct inode_operations ext2_dir_inode_operations; -extern struct inode_operations ext2_special_inode_operations; +extern const struct inode_operations ext2_dir_inode_operations; +extern const struct inode_operations ext2_special_inode_operations; /* symlink.c */ -extern struct inode_operations ext2_fast_symlink_inode_operations; -extern struct inode_operations ext2_symlink_inode_operations; +extern const struct inode_operations ext2_fast_symlink_inode_operations; +extern const struct inode_operations ext2_symlink_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2dba473c524..566d4e2d385 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -75,7 +75,7 @@ const struct file_operations ext2_xip_file_operations = { }; #endif -struct inode_operations ext2_file_inode_operations = { +const struct inode_operations ext2_file_inode_operations = { .truncate = ext2_truncate, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e1af5b4cf80..e69beed839a 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -373,7 +373,7 @@ out: return err; } -struct inode_operations ext2_dir_inode_operations = { +const struct inode_operations ext2_dir_inode_operations = { .create = ext2_create, .lookup = ext2_lookup, .link = ext2_link, @@ -393,7 +393,7 @@ struct inode_operations ext2_dir_inode_operations = { .permission = ext2_permission, }; -struct inode_operations ext2_special_inode_operations = { +const struct inode_operations ext2_special_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 6347c2dbdd8..a046a419d8a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -231,7 +231,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); #endif -static struct super_operations ext2_sops = { +static const struct super_operations ext2_sops = { .alloc_inode = ext2_alloc_inode, .destroy_inode = ext2_destroy_inode, .read_inode = ext2_read_inode, @@ -708,10 +708,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT2_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT2_FS_XATTR if (def_mount_opts & EXT2_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL if (def_mount_opts & EXT2_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 1e67d87cfa9..4e2426e22bb 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -28,7 +28,7 @@ static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations ext2_symlink_inode_operations = { +const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -40,7 +40,7 @@ struct inode_operations ext2_symlink_inode_operations = { #endif }; -struct inode_operations ext2_fast_symlink_inode_operations = { +const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext2_follow_link, #ifdef CONFIG_EXT2_FS_XATTR diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 881f6365c41..1e6f1386453 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -125,7 +125,7 @@ const struct file_operations ext3_file_operations = { .splice_write = generic_file_splice_write, }; -struct inode_operations ext3_file_inode_operations = { +const struct inode_operations ext3_file_inode_operations = { .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index deeb27b5ba8..c30e149fbd2 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/jbd.h> -#include <linux/sched.h> #include <linux/ext3_fs.h> #include <linux/cryptohash.h> diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index beaf25f5112..8a824f4ce5c 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -947,7 +947,7 @@ out: static int ext3_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = journal_current_handle(); + handle_t *handle = ext3_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -1717,7 +1717,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, /* * Reacquire the handle: ext3_get_block() can restart the transaction */ - handle = journal_current_handle(); + handle = ext3_journal_current_handle(); out_stop: if (handle) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 4df39c4315e..49159f13cc1 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1618,21 +1618,6 @@ static int ext3_delete_entry (handle_t *handle, return -ENOENT; } -/* - * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we - * do not perform it in these functions. We perform it at the call site, - * if it is needed. - */ -static inline void ext3_inc_count(handle_t *handle, struct inode *inode) -{ - inc_nlink(inode); -} - -static inline void ext3_dec_count(handle_t *handle, struct inode *inode) -{ - drop_nlink(inode); -} - static int ext3_add_nondir(handle_t *handle, struct dentry *dentry, struct inode *inode) { @@ -1642,7 +1627,7 @@ static int ext3_add_nondir(handle_t *handle, d_instantiate(dentry, inode); return 0; } - ext3_dec_count(handle, inode); + drop_nlink(inode); iput(inode); return err; } @@ -2163,7 +2148,7 @@ retry: err = __page_symlink(inode, symname, l, mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { - ext3_dec_count(handle, inode); + drop_nlink(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2191,6 +2176,12 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + */ + if (inode->i_nlink == 0) + return -ENOENT; retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2202,7 +2193,7 @@ retry: handle->h_sync = 1; inode->i_ctime = CURRENT_TIME_SEC; - ext3_inc_count(handle, inode); + inc_nlink(inode); atomic_inc(&inode->i_count); err = ext3_add_nondir(handle, dentry, inode); @@ -2374,7 +2365,7 @@ end_rename: /* * directories can handle most operations... */ -struct inode_operations ext3_dir_inode_operations = { +const struct inode_operations ext3_dir_inode_operations = { .create = ext3_create, .lookup = ext3_lookup, .link = ext3_link, @@ -2394,7 +2385,7 @@ struct inode_operations ext3_dir_inode_operations = { .permission = ext3_permission, }; -struct inode_operations ext3_special_inode_operations = { +const struct inode_operations ext3_special_inode_operations = { .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index b73cba12f79..ecf89904c11 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -11,7 +11,6 @@ #define EXT3FS_DEBUG -#include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/ext3_jbd.h> diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b34886734a4..4a4fcd6868c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -639,7 +639,7 @@ static struct quotactl_ops ext3_qctl_operations = { }; #endif -static struct super_operations ext3_sops = { +static const struct super_operations ext3_sops = { .alloc_inode = ext3_alloc_inode, .destroy_inode = ext3_destroy_inode, .read_inode = ext3_read_inode, @@ -1459,10 +1459,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT3_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3_FS_XATTR if (def_mount_opts & EXT3_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL if (def_mount_opts & EXT3_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) @@ -2344,6 +2348,22 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) err = -EROFS; goto restore_opts; } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + printk(KERN_WARNING "EXT3-fs: %s: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead.\n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 4f79122cde6..ff7b4ccd898 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -30,7 +30,7 @@ static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations ext3_symlink_inode_operations = { +const struct inode_operations ext3_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -42,7 +42,7 @@ struct inode_operations ext3_symlink_inode_operations = { #endif }; -struct inode_operations ext3_fast_symlink_inode_operations = { +const struct inode_operations ext3_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext3_follow_link, #ifdef CONFIG_EXT3_FS_XATTR diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dc2724fa762..7916b50f9a1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -222,7 +222,7 @@ static int ext4_ext_space_block(struct inode *inode) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); -#ifdef AGRESSIVE_TEST +#ifdef AGGRESSIVE_TEST if (size > 6) size = 6; #endif @@ -235,7 +235,7 @@ static int ext4_ext_space_block_idx(struct inode *inode) size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); -#ifdef AGRESSIVE_TEST +#ifdef AGGRESSIVE_TEST if (size > 5) size = 5; #endif @@ -249,7 +249,7 @@ static int ext4_ext_space_root(struct inode *inode) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); -#ifdef AGRESSIVE_TEST +#ifdef AGGRESSIVE_TEST if (size > 3) size = 3; #endif @@ -263,7 +263,7 @@ static int ext4_ext_space_root_idx(struct inode *inode) size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); -#ifdef AGRESSIVE_TEST +#ifdef AGGRESSIVE_TEST if (size > 4) size = 4; #endif @@ -1118,7 +1118,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) return 0; -#ifdef AGRESSIVE_TEST +#ifdef AGGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) return 0; #endif @@ -1891,8 +1891,8 @@ void ext4_ext_init(struct super_block *sb) if (test_opt(sb, EXTENTS)) { printk("EXT4-fs: file extents enabled"); -#ifdef AGRESSIVE_TEST - printk(", agressive tests"); +#ifdef AGGRESSIVE_TEST + printk(", aggressive tests"); #endif #ifdef CHECK_BINSEARCH printk(", check binsearch"); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3bbc24b5878..3c6c1fd2be9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ const struct file_operations ext4_file_operations = { .splice_write = generic_file_splice_write, }; -struct inode_operations ext4_file_inode_operations = { +const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, #ifdef CONFIG_EXT4DEV_FS_XATTR diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index a67966385e0..1555024e3b3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/sched.h> #include <linux/ext4_fs.h> #include <linux/cryptohash.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a127cc03c9f..fbff4b9e122 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -946,7 +946,7 @@ out: static int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = journal_current_handle(); + handle_t *handle = ext4_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -1716,7 +1716,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, /* * Reacquire the handle: ext4_get_block() can restart the transaction */ - handle = journal_current_handle(); + handle = ext4_journal_current_handle(); out_stop: if (handle) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e5a74a5ac26..e7e1d79a7d7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1616,21 +1616,6 @@ static int ext4_delete_entry (handle_t *handle, return -ENOENT; } -/* - * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we - * do not perform it in these functions. We perform it at the call site, - * if it is needed. - */ -static inline void ext4_inc_count(handle_t *handle, struct inode *inode) -{ - inc_nlink(inode); -} - -static inline void ext4_dec_count(handle_t *handle, struct inode *inode) -{ - drop_nlink(inode); -} - static int ext4_add_nondir(handle_t *handle, struct dentry *dentry, struct inode *inode) { @@ -1640,7 +1625,7 @@ static int ext4_add_nondir(handle_t *handle, d_instantiate(dentry, inode); return 0; } - ext4_dec_count(handle, inode); + drop_nlink(inode); iput(inode); return err; } @@ -2161,7 +2146,7 @@ retry: err = __page_symlink(inode, symname, l, mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { - ext4_dec_count(handle, inode); + drop_nlink(inode); ext4_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2189,6 +2174,12 @@ static int ext4_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + */ + if (inode->i_nlink == 0) + return -ENOENT; retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + @@ -2200,7 +2191,7 @@ retry: handle->h_sync = 1; inode->i_ctime = CURRENT_TIME_SEC; - ext4_inc_count(handle, inode); + inc_nlink(inode); atomic_inc(&inode->i_count); err = ext4_add_nondir(handle, dentry, inode); @@ -2372,7 +2363,7 @@ end_rename: /* * directories can handle most operations... */ -struct inode_operations ext4_dir_inode_operations = { +const struct inode_operations ext4_dir_inode_operations = { .create = ext4_create, .lookup = ext4_lookup, .link = ext4_link, @@ -2392,7 +2383,7 @@ struct inode_operations ext4_dir_inode_operations = { .permission = ext4_permission, }; -struct inode_operations ext4_special_inode_operations = { +const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, #ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 4fe49c3661b..ea99f6c97f5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -11,7 +11,6 @@ #define EXT4FS_DEBUG -#include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/ext4_jbd2.h> diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 486a641ca71..61c4718e4a5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -690,7 +690,7 @@ static struct quotactl_ops ext4_qctl_operations = { }; #endif -static struct super_operations ext4_sops = { +static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .destroy_inode = ext4_destroy_inode, .read_inode = ext4_read_inode, @@ -1518,10 +1518,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT4DEV_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -2419,6 +2423,22 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) err = -EROFS; goto restore_opts; } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + printk(KERN_WARNING "EXT4-fs: %s: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead.\n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index fcf527286d7..e6f9da4287c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,7 @@ static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations ext4_symlink_inode_operations = { +const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -42,7 +42,7 @@ struct inode_operations ext4_symlink_inode_operations = { #endif }; -struct inode_operations ext4_fast_symlink_inode_operations = { +const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, #ifdef CONFIG_EXT4DEV_FS_XATTR diff --git a/fs/fat/file.c b/fs/fat/file.c index c1237b70c1f..55d3c7461c5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -312,7 +312,7 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) } EXPORT_SYMBOL_GPL(fat_getattr); -struct inode_operations fat_file_inode_operations = { +const struct inode_operations fat_file_inode_operations = { .truncate = fat_truncate, .setattr = fat_notify_change, .getattr = fat_getattr, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a9e4688582a..76107354421 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -618,7 +618,7 @@ int fat_sync_inode(struct inode *inode) EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); -static struct super_operations fat_sops = { +static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, .destroy_inode = fat_destroy_inode, .write_inode = fat_write_inode, @@ -1151,7 +1151,7 @@ static int fat_read_root(struct inode *inode) * Read the super block of an MS-DOS FS. */ int fat_fill_super(struct super_block *sb, void *data, int silent, - struct inode_operations *fs_dir_inode_ops, int isvfat) + const struct inode_operations *fs_dir_inode_ops, int isvfat) { struct inode *root_inode = NULL; struct buffer_head *bh; diff --git a/fs/filesystems.c b/fs/filesystems.c index e3fa77c6ed5..7a4f61aa05f 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -12,7 +12,6 @@ #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/sched.h> /* for 'current' */ #include <asm/uaccess.h> /* diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index 1cf1fe8466a..91ccee8723f 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -62,7 +62,7 @@ extern void vxfs_read_inode(struct inode *); extern void vxfs_clear_inode(struct inode *); /* vxfs_lookup.c */ -extern struct inode_operations vxfs_dir_inode_ops; +extern const struct inode_operations vxfs_dir_inode_ops; extern const struct file_operations vxfs_dir_operations; /* vxfs_olt.c */ diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index 4e25f3fbed8..24b5a775ff9 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -48,7 +48,7 @@ static int vxfs_immed_readpage(struct file *, struct page *); * Unliked all other operations we do not go through the pagecache, * but do all work directly on the inode. */ -struct inode_operations vxfs_immed_symlink_iops = { +const struct inode_operations vxfs_immed_symlink_iops = { .readlink = generic_readlink, .follow_link = vxfs_immed_follow_link, }; diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 0b7ae897cb7..098a915fd9a 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -44,7 +44,7 @@ extern const struct address_space_operations vxfs_aops; extern const struct address_space_operations vxfs_immed_aops; -extern struct inode_operations vxfs_immed_symlink_iops; +extern const struct inode_operations vxfs_immed_symlink_iops; struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 3995d7fbeda..bf86e5444ea 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -52,7 +52,7 @@ static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int vxfs_readdir(struct file *, void *, filldir_t); -struct inode_operations vxfs_dir_inode_ops = { +const struct inode_operations vxfs_dir_inode_ops = { .lookup = vxfs_lookup, }; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index ac28b0835ff..647d600f0bc 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -59,7 +59,7 @@ static void vxfs_put_super(struct super_block *); static int vxfs_statfs(struct dentry *, struct kstatfs *); static int vxfs_remount(struct super_block *, int *, char *); -static struct super_operations vxfs_super_ops = { +static const struct super_operations vxfs_super_ops = { .read_inode = vxfs_read_inode, .clear_inode = vxfs_clear_inode, .put_super = vxfs_put_super, diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 1794305f9ed..105d4a271e0 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -73,7 +73,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, const char *name, int mode, int nlink, - struct inode_operations *iop, + const struct inode_operations *iop, const struct file_operations *fop) { struct dentry *dentry; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 40080477ceb..406bf61ed51 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1242,7 +1242,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) return err; } -static struct inode_operations fuse_dir_inode_operations = { +static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, .symlink = fuse_symlink, @@ -1270,7 +1270,7 @@ static const struct file_operations fuse_dir_operations = { .fsync = fuse_dir_fsync, }; -static struct inode_operations fuse_common_inode_operations = { +static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, @@ -1280,7 +1280,7 @@ static struct inode_operations fuse_common_inode_operations = { .removexattr = fuse_removexattr, }; -static struct inode_operations fuse_symlink_inode_operations = { +static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .follow_link = fuse_follow_link, .put_link = fuse_put_link, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f63efe1337e..2fd06927e85 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -69,7 +69,7 @@ void fuse_finish_open(struct inode *inode, struct file *file, if (outarg->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages(inode->i_mapping); + invalidate_mapping_pages(inode->i_mapping, 0, -1); ff->fh = outarg->fh; file->private_data = ff; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 12450d2b320..5ab8e50e780 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -112,7 +112,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) { struct fuse_conn *fc = get_fuse_conn(inode); if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) - invalidate_inode_pages(inode->i_mapping); + invalidate_mapping_pages(inode->i_mapping, 0, -1); inode->i_ino = attr->ino; inode->i_mode = (inode->i_mode & S_IFMT) + (attr->mode & 07777); @@ -446,7 +446,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr); } -static struct super_operations fuse_super_operations = { +static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .read_inode = fuse_read_inode, diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 6a2ffa2db14..de8e64c03f7 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,44 +4,43 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 help - A cluster filesystem. + A cluster filesystem. - Allows a cluster of computers to simultaneously use a block device - that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads - and writes to the block device like a local filesystem, but also uses - a lock module to allow the computers coordinate their I/O so - filesystem consistency is maintained. One of the nifty features of - GFS is perfect consistency -- changes made to the filesystem on one - machine show up immediately on all other machines in the cluster. + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. - To use the GFS2 filesystem, you will need to enable one or more of - the below locking modules. Documentation and utilities for GFS2 can - be found here: http://sources.redhat.com/cluster + To use the GFS2 filesystem, you will need to enable one or more of + the below locking modules. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster config GFS2_FS_LOCKING_NOLOCK tristate "GFS2 \"nolock\" locking module" depends on GFS2_FS help - Single node locking module for GFS2. + Single node locking module for GFS2. - Use this module if you want to use GFS2 on a single node without - its clustering features. You can still take advantage of the - large file support, and upgrade to running a full cluster later on - if required. + Use this module if you want to use GFS2 on a single node without + its clustering features. You can still take advantage of the + large file support, and upgrade to running a full cluster later on + if required. - If you will only be using GFS2 in cluster mode, you do not need this - module. + If you will only be using GFS2 in cluster mode, you do not need this + module. config GFS2_FS_LOCKING_DLM tristate "GFS2 DLM locking module" - depends on GFS2_FS && NET && INET && (IPV6 || IPV6=n) + depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) select IP_SCTP if DLM_SCTP select CONFIGFS_FS select DLM help - Multiple node locking module for GFS2 - - Most users of GFS2 will require this module. It provides the locking - interface between GFS2 and the DLM, which is required to use GFS2 - in a cluster environment. + Multiple node locking module for GFS2 + Most users of GFS2 will require this module. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8240c1ff94f..c53a5d2d059 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -773,7 +772,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_free_data(ip, bstart, blen); } - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, dibh->b_data); @@ -848,7 +847,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) } ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) @@ -963,7 +962,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (gfs2_is_stuffed(ip)) { ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); @@ -975,7 +974,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (!error) { ip->i_di.di_size = size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1048,7 +1047,7 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_num.no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 0fdcb7713cd..82a1ac7895a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -53,7 +53,6 @@ * but never before the maximum hash table size has been reached. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> @@ -131,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_di.di_size < offset + size) ip->i_di.di_size = offset + size; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -229,7 +228,7 @@ out: if (ip->i_di.di_size < offset + copied) ip->i_di.di_size = offset + copied; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1198,12 +1197,11 @@ static int compare_dents(const void *a, const void *b) */ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, - void *opaque, gfs2_filldir_t filldir, + void *opaque, filldir_t filldir, const struct gfs2_dirent **darr, u32 entries, int *copied) { const struct gfs2_dirent *dent, *dent_next; - struct gfs2_inum_host inum; u64 off, off_next; unsigned int x, y; int run = 0; @@ -1240,11 +1238,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, *offset = off; } - gfs2_inum_in(&inum, (char *)&dent->de_inum); - error = filldir(opaque, (const char *)(dent + 1), be16_to_cpu(dent->de_name_len), - off, &inum, + off, be64_to_cpu(dent->de_inum.no_addr), be16_to_cpu(dent->de_type)); if (error) return 1; @@ -1262,8 +1258,8 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, } static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir, int *copied, - unsigned *depth, u64 leaf_no) + filldir_t filldir, int *copied, unsigned *depth, + u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh; @@ -1343,7 +1339,7 @@ out: */ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir) + filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1402,7 +1398,7 @@ out: } int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - gfs2_filldir_t filldir) + filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); struct dirent_gather g; @@ -1568,7 +1564,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, break; gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_di.di_entries++; - ip->i_inode.i_mtime.tv_sec = ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; @@ -1654,7 +1650,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) gfs2_consist_inode(dip); gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_di.di_entries--; - dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(dip, bh->b_data); brelse(bh); mark_inode_dirty(&dip->i_inode); @@ -1702,7 +1698,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_trans_add_bh(dip->i_gl, bh, 1); } - dip->i_inode.i_mtime.tv_sec = dip->i_inode.i_ctime.tv_sec = get_seconds(); + dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_dinode_out(dip, bh->b_data); brelse(bh); return 0; diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index b21b33668a5..48fe89046bb 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,30 +16,13 @@ struct inode; struct gfs2_inode; struct gfs2_inum; -/** - * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read() - * @opaque: opaque data used by the function - * @name: the name of the directory entry - * @length: the length of the name - * @offset: the entry's offset in the directory - * @inum: the inode number the entry points to - * @type: the type of inode the entry points to - * - * Returns: 0 on success, 1 if buffer full - */ - -typedef int (*gfs2_filldir_t) (void *opaque, - const char *name, unsigned int length, - u64 offset, - struct gfs2_inum_host *inum, unsigned int type); - int gfs2_dir_search(struct inode *dir, const struct qstr *filename, struct gfs2_inum_host *inum, unsigned int *type); int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inum_host *inum, unsigned int type); int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); -int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque, - gfs2_filldir_t filldir); +int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, struct gfs2_inum_host *new_inum, unsigned int new_type); diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c index cd747c00f67..c1f44009853 100644 --- a/fs/gfs2/eaops.c +++ b/fs/gfs2/eaops.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index ebebbdcd705..5b83ca6acab 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -301,7 +300,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -718,7 +717,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -853,7 +852,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); ip->i_inode.i_mode = er->er_mode; } - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1134,7 +1133,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 438146904b5..6618c119025 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -19,6 +19,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/list.h> #include <linux/lm_interface.h> +#include <linux/wait.h> +#include <linux/rwsem.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -33,11 +35,6 @@ #include "super.h" #include "util.h" -struct greedy { - struct gfs2_holder gr_gh; - struct delayed_work gr_work; -}; - struct gfs2_gl_hash_bucket { struct hlist_head hb_list; }; @@ -47,6 +44,9 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); static int dump_glock(struct gfs2_glock *gl); static int dump_inode(struct gfs2_inode *ip); +static void gfs2_glock_xmote_th(struct gfs2_holder *gh); +static void gfs2_glock_drop_th(struct gfs2_glock *gl); +static DECLARE_RWSEM(gfs2_umount_flush_sem); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) @@ -213,30 +213,6 @@ out: } /** - * queue_empty - check to see if a glock's queue is empty - * @gl: the glock - * @head: the head of the queue to check - * - * This function protects the list in the event that a process already - * has a holder on the list and is adding a second holder for itself. - * The glmutex lock is what generally prevents processes from working - * on the same glock at once, but the special case of adding a second - * holder for yourself ("recursive" locking) doesn't involve locking - * glmutex, making the spin lock necessary. - * - * Returns: 1 if the queue is empty - */ - -static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head) -{ - int empty; - spin_lock(&gl->gl_spin); - empty = list_empty(head); - spin_unlock(&gl->gl_spin); - return empty; -} - -/** * search_bucket() - Find struct gfs2_glock by lock number * @bucket: the bucket to search * @name: The lock name @@ -395,11 +371,6 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, gh->gh_flags = flags; gh->gh_error = 0; gh->gh_iflags = 0; - init_completion(&gh->gh_wait); - - if (gh->gh_state == LM_ST_EXCLUSIVE) - gh->gh_flags |= GL_LOCAL_EXCL; - gfs2_glock_hold(gl); } @@ -417,9 +388,6 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * { gh->gh_state = state; gh->gh_flags = flags; - if (gh->gh_state == LM_ST_EXCLUSIVE) - gh->gh_flags |= GL_LOCAL_EXCL; - gh->gh_iflags &= 1 << HIF_ALLOCED; gh->gh_ip = (unsigned long)__builtin_return_address(0); } @@ -479,6 +447,29 @@ static void gfs2_holder_put(struct gfs2_holder *gh) kfree(gh); } +static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh) +{ + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) { + gfs2_holder_put(gh); + return; + } + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +static int holder_wait(void *word) +{ + schedule(); + return 0; +} + +static void wait_on_holder(struct gfs2_holder *gh) +{ + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, holder_wait, TASK_UNINTERRUPTIBLE); +} + /** * rq_mutex - process a mutex request in the queue * @gh: the glock holder @@ -493,7 +484,9 @@ static int rq_mutex(struct gfs2_holder *gh) list_del_init(&gh->gh_list); /* gh->gh_error never examined. */ set_bit(GLF_LOCK, &gl->gl_flags); - complete(&gh->gh_wait); + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); return 1; } @@ -511,7 +504,6 @@ static int rq_promote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { if (list_empty(&gl->gl_holders)) { @@ -526,7 +518,7 @@ static int rq_promote(struct gfs2_holder *gh) gfs2_reclaim_glock(sdp); } - glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + gfs2_glock_xmote_th(gh); spin_lock(&gl->gl_spin); } return 1; @@ -537,11 +529,11 @@ static int rq_promote(struct gfs2_holder *gh) set_bit(GLF_LOCK, &gl->gl_flags); } else { struct gfs2_holder *next_gh; - if (gh->gh_flags & GL_LOCAL_EXCL) + if (gh->gh_state == LM_ST_EXCLUSIVE) return 1; next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); - if (next_gh->gh_flags & GL_LOCAL_EXCL) + if (next_gh->gh_state == LM_ST_EXCLUSIVE) return 1; } @@ -549,7 +541,7 @@ static int rq_promote(struct gfs2_holder *gh) gh->gh_error = 0; set_bit(HIF_HOLDER, &gh->gh_iflags); - complete(&gh->gh_wait); + gfs2_holder_dispose_or_wake(gh); return 0; } @@ -564,7 +556,6 @@ static int rq_promote(struct gfs2_holder *gh) static int rq_demote(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - const struct gfs2_glock_operations *glops = gl->gl_ops; if (!list_empty(&gl->gl_holders)) return 1; @@ -573,10 +564,7 @@ static int rq_demote(struct gfs2_holder *gh) list_del_init(&gh->gh_list); gh->gh_error = 0; spin_unlock(&gl->gl_spin); - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); + gfs2_holder_dispose_or_wake(gh); spin_lock(&gl->gl_spin); } else { gl->gl_req_gh = gh; @@ -585,9 +573,9 @@ static int rq_demote(struct gfs2_holder *gh) if (gh->gh_state == LM_ST_UNLOCKED || gl->gl_state != LM_ST_EXCLUSIVE) - glops->go_drop_th(gl); + gfs2_glock_drop_th(gl); else - glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + gfs2_glock_xmote_th(gh); spin_lock(&gl->gl_spin); } @@ -596,30 +584,6 @@ static int rq_demote(struct gfs2_holder *gh) } /** - * rq_greedy - process a queued request to drop greedy status - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_greedy(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - list_del_init(&gh->gh_list); - /* gh->gh_error never examined. */ - clear_bit(GLF_GREEDY, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - gfs2_holder_uninit(gh); - kfree(container_of(gh, struct greedy, gr_gh)); - - spin_lock(&gl->gl_spin); - - return 0; -} - -/** * run_queue - process holder structures on a glock * @gl: the glock * @@ -649,8 +613,6 @@ static void run_queue(struct gfs2_glock *gl) if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) blocked = rq_demote(gh); - else if (test_bit(HIF_GREEDY, &gh->gh_iflags)) - blocked = rq_greedy(gh); else gfs2_assert_warn(gl->gl_sbd, 0); @@ -684,6 +646,8 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl) gfs2_holder_init(gl, 0, 0, &gh); set_bit(HIF_MUTEX, &gh.gh_iflags); + if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags)) + BUG(); spin_lock(&gl->gl_spin); if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { @@ -691,11 +655,13 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl) } else { gl->gl_owner = current; gl->gl_ip = (unsigned long)__builtin_return_address(0); - complete(&gh.gh_wait); + clear_bit(HIF_WAIT, &gh.gh_iflags); + smp_mb(); + wake_up_bit(&gh.gh_iflags, HIF_WAIT); } spin_unlock(&gl->gl_spin); - wait_for_completion(&gh.gh_wait); + wait_on_holder(&gh); gfs2_holder_uninit(&gh); } @@ -774,6 +740,7 @@ restart: return; set_bit(HIF_DEMOTE, &new_gh->gh_iflags); set_bit(HIF_DEALLOC, &new_gh->gh_iflags); + set_bit(HIF_WAIT, &new_gh->gh_iflags); goto restart; } @@ -825,7 +792,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) int op_done = 1; gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); state_change(gl, ret & LM_OUT_ST_MASK); @@ -908,12 +875,8 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) gfs2_glock_put(gl); - if (gh) { - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); - } + if (gh) + gfs2_holder_dispose_or_wake(gh); } /** @@ -924,23 +887,26 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) * */ -void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags) +void gfs2_glock_xmote_th(struct gfs2_holder *gh) { + struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; + int flags = gh->gh_flags; + unsigned state = gh->gh_state; const struct gfs2_glock_operations *glops = gl->gl_ops; int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_ANY | LM_FLAG_PRIORITY); unsigned int lck_ret; + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); gfs2_assert_warn(sdp, state != gl->gl_state); - if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) - glops->go_sync(gl); - gfs2_glock_hold(gl); gl->gl_req_bh = xmote_bh; @@ -971,10 +937,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh = gl->gl_req_gh; - clear_bit(GLF_PREFETCH, &gl->gl_flags); - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, !ret); state_change(gl, LM_ST_UNLOCKED); @@ -1001,12 +965,8 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) gfs2_glock_put(gl); - if (gh) { - if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) - gfs2_holder_put(gh); - else - complete(&gh->gh_wait); - } + if (gh) + gfs2_holder_dispose_or_wake(gh); } /** @@ -1015,19 +975,19 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret) * */ -void gfs2_glock_drop_th(struct gfs2_glock *gl) +static void gfs2_glock_drop_th(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned int ret; + if (glops->go_drop_th) + glops->go_drop_th(gl); + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); - if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) - glops->go_sync(gl); - gfs2_glock_hold(gl); gl->gl_req_bh = drop_bh; @@ -1107,8 +1067,7 @@ static int glock_wait_internal(struct gfs2_holder *gh) if (gh->gh_flags & LM_FLAG_PRIORITY) do_cancels(gh); - wait_for_completion(&gh->gh_wait); - + wait_on_holder(gh); if (gh->gh_error) return gh->gh_error; @@ -1164,6 +1123,8 @@ static void add_to_queue(struct gfs2_holder *gh) struct gfs2_holder *existing; BUG_ON(!gh->gh_owner); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + BUG(); existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); if (existing) { @@ -1227,8 +1188,6 @@ restart: } } - clear_bit(GLF_PREFETCH, &gl->gl_flags); - return error; } @@ -1321,98 +1280,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } /** - * gfs2_glock_prefetch - Try to prefetch a glock - * @gl: the glock - * @state: the state to prefetch in - * @flags: flags passed to go_xmote_th() - * - */ - -static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, - int flags) -{ - const struct gfs2_glock_operations *glops = gl->gl_ops; - - spin_lock(&gl->gl_spin); - - if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) || - !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) || - !list_empty(&gl->gl_waiters3) || - relaxed_state_ok(gl->gl_state, state, flags)) { - spin_unlock(&gl->gl_spin); - return; - } - - set_bit(GLF_PREFETCH, &gl->gl_flags); - set_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - glops->go_xmote_th(gl, state, flags); -} - -static void greedy_work(struct work_struct *work) -{ - struct greedy *gr = container_of(work, struct greedy, gr_work.work); - struct gfs2_holder *gh = &gr->gr_gh; - struct gfs2_glock *gl = gh->gh_gl; - const struct gfs2_glock_operations *glops = gl->gl_ops; - - clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); - - if (glops->go_greedy) - glops->go_greedy(gl); - - spin_lock(&gl->gl_spin); - - if (list_empty(&gl->gl_waiters2)) { - clear_bit(GLF_GREEDY, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_holder_uninit(gh); - kfree(gr); - } else { - gfs2_glock_hold(gl); - list_add_tail(&gh->gh_list, &gl->gl_waiters2); - run_queue(gl); - spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); - } -} - -/** - * gfs2_glock_be_greedy - - * @gl: - * @time: - * - * Returns: 0 if go_greedy will be called, 1 otherwise - */ - -int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time) -{ - struct greedy *gr; - struct gfs2_holder *gh; - - if (!time || gl->gl_sbd->sd_args.ar_localcaching || - test_and_set_bit(GLF_GREEDY, &gl->gl_flags)) - return 1; - - gr = kmalloc(sizeof(struct greedy), GFP_KERNEL); - if (!gr) { - clear_bit(GLF_GREEDY, &gl->gl_flags); - return 1; - } - gh = &gr->gr_gh; - - gfs2_holder_init(gl, 0, 0, gh); - set_bit(HIF_GREEDY, &gh->gh_iflags); - INIT_DELAYED_WORK(&gr->gr_work, greedy_work); - - set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); - schedule_delayed_work(&gr->gr_work, time); - - return 0; -} - -/** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure * @@ -1470,10 +1337,7 @@ static int glock_compare(const void *arg_a, const void *arg_b) return 1; if (a->ln_number < b->ln_number) return -1; - if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE) - return 1; - if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL)) - return 1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); return 0; } @@ -1618,34 +1482,6 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) } /** - * gfs2_glock_prefetch_num - prefetch a glock based on lock number - * @sdp: the filesystem - * @number: the lock number - * @glops: the glock operations for the type of glock - * @state: the state to acquire the glock in - * @flags: modifier flags for the aquisition - * - * Returns: errno - */ - -void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, int flags) -{ - struct gfs2_glock *gl; - int error; - - if (atomic_read(&sdp->sd_reclaim_count) < - gfs2_tune_get(sdp, gt_reclaim_limit)) { - error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); - if (!error) { - gfs2_glock_prefetch(gl, state, flags); - gfs2_glock_put(gl); - } - } -} - -/** * gfs2_lvb_hold - attach a LVB from a glock * @gl: The glock in question * @@ -1703,8 +1539,6 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, if (!gl) return; - if (gl->gl_ops->go_callback) - gl->gl_ops->go_callback(gl, state); handle_callback(gl, state); spin_lock(&gl->gl_spin); @@ -1746,12 +1580,14 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) struct lm_async_cb *async = data; struct gfs2_glock *gl; + down_read(&gfs2_umount_flush_sem); gl = gfs2_glock_find(sdp, &async->lc_name); if (gfs2_assert_warn(sdp, gl)) return; if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) gl->gl_req_bh(gl, async->lc_ret); gfs2_glock_put(gl); + up_read(&gfs2_umount_flush_sem); return; } @@ -1781,15 +1617,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) static int demote_ok(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; const struct gfs2_glock_operations *glops = gl->gl_ops; int demote = 1; if (test_bit(GLF_STICKY, &gl->gl_flags)) demote = 0; - else if (test_bit(GLF_PREFETCH, &gl->gl_flags)) - demote = time_after_eq(jiffies, gl->gl_stamp + - gfs2_tune_get(sdp, gt_prefetch_secs) * HZ); else if (glops->go_demote_ok) demote = glops->go_demote_ok(gl); @@ -1845,7 +1677,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) atomic_inc(&sdp->sd_reclaimed); if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED); gfs2_glmutex_unlock(gl); @@ -1909,7 +1741,7 @@ static void scan_glock(struct gfs2_glock *gl) return; if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) goto out_schedule; gfs2_glmutex_unlock(gl); @@ -1958,7 +1790,7 @@ static void clear_glock(struct gfs2_glock *gl) } if (gfs2_glmutex_trylock(gl)) { - if (queue_empty(gl, &gl->gl_holders) && + if (list_empty(&gl->gl_holders) && gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED); gfs2_glmutex_unlock(gl); @@ -2000,7 +1832,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) t = jiffies; } + down_write(&gfs2_umount_flush_sem); invalidate_inodes(sdp->sd_vfs); + up_write(&gfs2_umount_flush_sem); msleep(10); } } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index fb39108fc05..f50e40ceca4 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -20,7 +20,6 @@ #define LM_FLAG_ANY 0x00000008 #define LM_FLAG_PRIORITY 0x00000010 */ -#define GL_LOCAL_EXCL 0x00000020 #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 @@ -83,17 +82,11 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh); void gfs2_holder_uninit(struct gfs2_holder *gh); - -void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags); -void gfs2_glock_drop_th(struct gfs2_glock *gl); - int gfs2_glock_nq(struct gfs2_holder *gh); int gfs2_glock_poll(struct gfs2_holder *gh); int gfs2_glock_wait(struct gfs2_holder *gh); void gfs2_glock_dq(struct gfs2_holder *gh); -int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time); - void gfs2_glock_dq_uninit(struct gfs2_holder *gh); int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, @@ -103,10 +96,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, int flags); - /** * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock * @gl: the glock diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b068d10bcb6..46af5535551 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -117,12 +116,14 @@ static void gfs2_pte_inval(struct gfs2_glock *gl) static void meta_go_sync(struct gfs2_glock *gl) { + if (gl->gl_state != LM_ST_EXCLUSIVE) + return; + if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { gfs2_log_flush(gl->gl_sbd, gl); gfs2_meta_sync(gl); gfs2_ail_empty_gl(gl); } - } /** @@ -142,6 +143,37 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) } /** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * + */ + +static void inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + + if (ip && !S_ISREG(ip->i_inode.i_mode)) + ip = NULL; + + if (test_bit(GLF_DIRTY, &gl->gl_flags)) { + gfs2_log_flush(gl->gl_sbd, gl); + if (ip) + filemap_fdatawrite(ip->i_inode.i_mapping); + gfs2_meta_sync(gl); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + int error = filemap_fdatawait(mapping); + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else if (error) + set_bit(AS_EIO, &mapping->flags); + } + clear_bit(GLF_DIRTY, &gl->gl_flags); + gfs2_ail_empty_gl(gl); + } +} + +/** * inode_go_xmote_th - promote/demote a glock * @gl: the glock * @state: the requested state @@ -149,12 +181,12 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) * */ -static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state, - int flags) +static void inode_go_xmote_th(struct gfs2_glock *gl) { if (gl->gl_state != LM_ST_UNLOCKED) gfs2_pte_inval(gl); - gfs2_glock_xmote_th(gl, state, flags); + if (gl->gl_state == LM_ST_EXCLUSIVE) + inode_go_sync(gl); } /** @@ -189,38 +221,8 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl) static void inode_go_drop_th(struct gfs2_glock *gl) { gfs2_pte_inval(gl); - gfs2_glock_drop_th(gl); -} - -/** - * inode_go_sync - Sync the dirty data and/or metadata for an inode glock - * @gl: the glock protecting the inode - * - */ - -static void inode_go_sync(struct gfs2_glock *gl) -{ - struct gfs2_inode *ip = gl->gl_object; - - if (ip && !S_ISREG(ip->i_inode.i_mode)) - ip = NULL; - - if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - if (ip) - filemap_fdatawrite(ip->i_inode.i_mapping); - gfs2_meta_sync(gl); - if (ip) { - struct address_space *mapping = ip->i_inode.i_mapping; - int error = filemap_fdatawait(mapping); - if (error == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else if (error) - set_bit(AS_EIO, &mapping->flags); - } - clear_bit(GLF_DIRTY, &gl->gl_flags); - gfs2_ail_empty_gl(gl); - } + if (gl->gl_state == LM_ST_EXCLUSIVE) + inode_go_sync(gl); } /** @@ -295,7 +297,7 @@ static int inode_go_lock(struct gfs2_holder *gh) if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_flags & GL_LOCAL_EXCL)) + (gh->gh_state == LM_ST_EXCLUSIVE)) error = gfs2_truncatei_resume(ip); return error; @@ -319,39 +321,6 @@ static void inode_go_unlock(struct gfs2_holder *gh) } /** - * inode_greedy - - * @gl: the glock - * - */ - -static void inode_greedy(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = gl->gl_object; - unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum); - unsigned int max = gfs2_tune_get(sdp, gt_greedy_max); - unsigned int new_time; - - spin_lock(&ip->i_spin); - - if (time_after(ip->i_last_pfault + quantum, jiffies)) { - new_time = ip->i_greedy + quantum; - if (new_time > max) - new_time = max; - } else { - new_time = ip->i_greedy - quantum; - if (!new_time || new_time > max) - new_time = 1; - } - - ip->i_greedy = new_time; - - spin_unlock(&ip->i_spin); - - iput(&ip->i_inode); -} - -/** * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock * @gl: the glock * @@ -398,8 +367,7 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) * */ -static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, - int flags) +static void trans_go_xmote_th(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; @@ -408,8 +376,6 @@ static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } - - gfs2_glock_xmote_th(gl, state, flags); } /** @@ -461,8 +427,6 @@ static void trans_go_drop_th(struct gfs2_glock *gl) gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } - - gfs2_glock_drop_th(gl); } /** @@ -478,8 +442,8 @@ static int quota_go_demote_ok(struct gfs2_glock *gl) } const struct gfs2_glock_operations gfs2_meta_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, + .go_xmote_th = meta_go_sync, + .go_drop_th = meta_go_sync, .go_type = LM_TYPE_META, }; @@ -487,19 +451,14 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_xmote_th = inode_go_xmote_th, .go_xmote_bh = inode_go_xmote_bh, .go_drop_th = inode_go_drop_th, - .go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, .go_unlock = inode_go_unlock, - .go_greedy = inode_greedy, .go_type = LM_TYPE_INODE, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, - .go_sync = meta_go_sync, .go_inval = meta_go_inval, .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, @@ -515,33 +474,23 @@ const struct gfs2_glock_operations gfs2_trans_glops = { }; const struct gfs2_glock_operations gfs2_iopen_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_IOPEN, }; const struct gfs2_glock_operations gfs2_flock_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_FLOCK, }; const struct gfs2_glock_operations gfs2_nondisk_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_NONDISK, }; const struct gfs2_glock_operations gfs2_quota_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_demote_ok = quota_go_demote_ok, .go_type = LM_TYPE_QUOTA, }; const struct gfs2_glock_operations gfs2_journal_glops = { - .go_xmote_th = gfs2_glock_xmote_th, - .go_drop_th = gfs2_glock_drop_th, .go_type = LM_TYPE_JOURNAL, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 734421edae8..12c80fd28db 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -101,17 +101,14 @@ struct gfs2_bufdata { }; struct gfs2_glock_operations { - void (*go_xmote_th) (struct gfs2_glock *gl, unsigned int state, int flags); + void (*go_xmote_th) (struct gfs2_glock *gl); void (*go_xmote_bh) (struct gfs2_glock *gl); void (*go_drop_th) (struct gfs2_glock *gl); void (*go_drop_bh) (struct gfs2_glock *gl); - void (*go_sync) (struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - void (*go_callback) (struct gfs2_glock *gl, unsigned int state); - void (*go_greedy) (struct gfs2_glock *gl); const int go_type; }; @@ -120,7 +117,6 @@ enum { HIF_MUTEX = 0, HIF_PROMOTE = 1, HIF_DEMOTE = 2, - HIF_GREEDY = 3, /* States */ HIF_ALLOCED = 4, @@ -128,6 +124,7 @@ enum { HIF_HOLDER = 6, HIF_FIRST = 7, HIF_ABORTED = 9, + HIF_WAIT = 10, }; struct gfs2_holder { @@ -140,17 +137,14 @@ struct gfs2_holder { int gh_error; unsigned long gh_iflags; - struct completion gh_wait; unsigned long gh_ip; }; enum { GLF_LOCK = 1, GLF_STICKY = 2, - GLF_PREFETCH = 3, GLF_DIRTY = 5, GLF_SKIP_WAITERS2 = 6, - GLF_GREEDY = 7, }; struct gfs2_glock { @@ -167,7 +161,7 @@ struct gfs2_glock { unsigned long gl_ip; struct list_head gl_holders; struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ + struct list_head gl_waiters2; /* HIF_DEMOTE */ struct list_head gl_waiters3; /* HIF_PROMOTE */ const struct gfs2_glock_operations *gl_ops; @@ -236,7 +230,6 @@ struct gfs2_inode { spinlock_t i_spin; struct rw_semaphore i_rw_mutex; - unsigned int i_greedy; unsigned long i_last_pfault; struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; @@ -418,17 +411,12 @@ struct gfs2_tune { unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; unsigned int gt_new_files_directio; - unsigned int gt_max_atomic_write; /* Split big writes into this size */ unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_lockdump_size; unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ unsigned int gt_entries_per_readdir; - unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */ - unsigned int gt_greedy_default; - unsigned int gt_greedy_quantum; - unsigned int gt_greedy_max; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d122074c45e..0d6831a4056 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -287,10 +287,8 @@ out: * * Returns: errno */ - int gfs2_change_nlink(struct gfs2_inode *ip, int diff) { - struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info; struct buffer_head *dibh; u32 nlink; int error; @@ -315,42 +313,34 @@ int gfs2_change_nlink(struct gfs2_inode *ip, int diff) else drop_nlink(&ip->i_inode); - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); mark_inode_dirty(&ip->i_inode); - if (ip->i_inode.i_nlink == 0) { - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh, rg_gh; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - error = -EIO; - rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); - if (!rgd) - goto out_norgrp; - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); - if (error) - goto out_norgrp; - + if (ip->i_inode.i_nlink == 0) gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ - gfs2_glock_dq_uninit(&rg_gh); -out_norgrp: - gfs2_glock_dq_uninit(&ri_gh); - } -out: + return error; } struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) { struct qstr qstr; + struct inode *inode; gfs2_str2qstr(&qstr, name); - return gfs2_lookupi(dip, &qstr, 1, NULL); + inode = gfs2_lookupi(dip, &qstr, 1, NULL); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. + */ + if (inode == NULL) + return ERR_PTR(-ENOENT); + else + return inode; } @@ -361,8 +351,10 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) * @is_root: If 1, ignore the caller's permissions * @i_gh: An uninitialized holder for the new inode glock * - * There will always be a vnode (Linux VFS inode) for the d_gh inode unless - * @is_root is true. + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. * * Returns: errno */ @@ -375,8 +367,9 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct gfs2_holder d_gh; struct gfs2_inum_host inum; unsigned int type; - int error = 0; + int error; struct inode *inode = NULL; + int unlock = 0; if (!name->len || name->len > GFS2_FNAMESIZE) return ERR_PTR(-ENAMETOOLONG); @@ -388,9 +381,12 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, return dir; } - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - if (error) - return ERR_PTR(error); + if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + unlock = 1; + } if (!is_root) { error = permission(dir, MAY_EXEC, NULL); @@ -405,10 +401,11 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, inode = gfs2_inode_lookup(sb, &inum, type); out: - gfs2_glock_dq_uninit(&d_gh); + if (unlock) + gfs2_glock_dq_uninit(&d_gh); if (error == -ENOENT) return NULL; - return inode; + return inode ? inode : ERR_PTR(error); } static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c index effe4a337c1..cfcc39b86a5 100644 --- a/fs/gfs2/lm.c +++ b/fs/gfs2/lm.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -104,15 +103,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) vprintk(fmt, args); va_end(args); - fs_err(sdp, "about to withdraw from the cluster\n"); + fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); - - fs_err(sdp, "waiting for outstanding I/O\n"); - - /* FIXME: suspend dm device so oustanding bio's complete - and all further io requests fail */ - fs_err(sdp, "telling LM to withdraw\n"); gfs2_withdraw_lockproto(&sdp->sd_lockstruct); fs_err(sdp, "withdrawn\n"); diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 33af707a4d3..a87c7bf3c56 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -36,7 +36,7 @@ #define GDLM_STRNAME_BYTES 24 #define GDLM_LVB_SIZE 32 -#define GDLM_DROP_COUNT 50000 +#define GDLM_DROP_COUNT 200000 #define GDLM_DROP_PERIOD 60 #define GDLM_NAME_LEN 128 diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c index 2194b1d5b5e..a0e7eda643e 100644 --- a/fs/gfs2/locking/dlm/main.c +++ b/fs/gfs2/locking/dlm/main.c @@ -11,9 +11,6 @@ #include "lock_dlm.h" -extern int gdlm_drop_count; -extern int gdlm_drop_period; - extern struct lm_lockops gdlm_ops; static int __init init_lock_dlm(void) @@ -40,9 +37,6 @@ static int __init init_lock_dlm(void) return error; } - gdlm_drop_count = GDLM_DROP_COUNT; - gdlm_drop_period = GDLM_DROP_PERIOD; - printk(KERN_INFO "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); return 0; diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index cdd1694e889..1d8faa3da8a 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -9,8 +9,6 @@ #include "lock_dlm.h" -int gdlm_drop_count; -int gdlm_drop_period; const struct lm_lockops gdlm_ops; @@ -24,8 +22,8 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = gdlm_drop_count; - ls->drop_locks_period = gdlm_drop_period; + ls->drop_locks_count = GDLM_DROP_COUNT; + ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c index 3799f19b282..1dd4215b83d 100644 --- a/fs/gfs2/locking/dlm/plock.c +++ b/fs/gfs2/locking/dlm/plock.c @@ -264,7 +264,7 @@ static unsigned int dev_poll(struct file *file, poll_table *wait) return 0; } -static struct file_operations dev_fops = { +static const struct file_operations dev_fops = { .read = dev_read, .write = dev_write, .poll = dev_poll, diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 29ae06f9494..4746b884662 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -116,6 +116,17 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } +static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->drop_locks_count); +} + +static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ls->drop_locks_count = simple_strtol(buf, NULL, 0); + return len; +} + struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -135,6 +146,7 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -147,6 +159,7 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, + &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4d7f94d8c7b..16bb4b4561a 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -69,13 +69,16 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); struct gfs2_trans *tr; - if (!list_empty(&bd->bd_list_tr)) + gfs2_log_lock(sdp); + if (!list_empty(&bd->bd_list_tr)) { + gfs2_log_unlock(sdp); return; - + } tr = current->journal_info; tr->tr_touched = 1; tr->tr_num_buf++; list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_log_unlock(sdp); if (!list_empty(&le->le_list)) return; @@ -84,7 +87,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) gfs2_meta_check(sdp, bd->bd_bh); gfs2_pin(sdp, bd->bd_bh); - gfs2_log_lock(sdp); sdp->sd_log_num_buf++; list_add(&le->le_list, &sdp->sd_log_le_buf); @@ -98,11 +100,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) struct list_head *head = &tr->tr_list_buf; struct gfs2_bufdata *bd; + gfs2_log_lock(sdp); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); list_del_init(&bd->bd_list_tr); tr->tr_num_buf--; } + gfs2_log_unlock(sdp); gfs2_assert_warn(sdp, !tr->tr_num_buf); } @@ -462,13 +466,17 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); + gfs2_log_lock(sdp); tr->tr_touched = 1; if (list_empty(&bd->bd_list_tr) && (ip->i_di.di_flags & GFS2_DIF_JDATA)) { tr->tr_num_buf++; list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_log_unlock(sdp); gfs2_pin(sdp, bd->bd_bh); tr->tr_num_buf_new++; + } else { + gfs2_log_unlock(sdp); } gfs2_trans_add_gl(bd->bd_gl); gfs2_log_lock(sdp); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 7c1a9e22a52..6e8a59809ab 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0e34d991897..e62d4f620c5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -282,8 +282,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, return; } - bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), - memset(bd, 0, sizeof(struct gfs2_bufdata)); + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), bd->bd_bh = bh; bd->bd_gl = gl; diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index ef3092e2960..32caecd2030 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c index f2495f1e21a..d9ecfd23a49 100644 --- a/fs/gfs2/ondisk.c +++ b/fs/gfs2/ondisk.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index d8d69a72a10..56e33590b65 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -16,6 +16,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/fs.h> +#include <linux/writeback.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> @@ -157,6 +158,32 @@ out_ignore: } /** + * gfs2_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For journaled files and/or ordered writes this just falls back to the + * kernel's default writepages path for now. We will probably want to change + * that eventually (i.e. when we look at allocate on flush). + * + * For the data=writeback case though we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. + */ +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip)) + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + return generic_writepages(mapping, wbc); +} + +/** * stuffed_readpage - Fill in a Linux page with stuffed file data * @ip: the inode * @page: the page @@ -256,7 +283,7 @@ out_unlock: * the page lock and the glock) and return having done no I/O. Its * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. - * 2. We have to handle stuffed files here too. + * 2. We don't handle stuffed files here we let readpage do the honours. * 3. mpage_readpages() does most of the heavy lifting in the common case. * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as @@ -269,8 +296,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - unsigned page_idx; - int ret; + int ret = 0; int do_unlock = 0; if (likely(file != &gfs2_internal_file_sentinel)) { @@ -289,29 +315,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, goto out_unlock; } skip_lock: - if (gfs2_is_stuffed(ip)) { - struct pagevec lru_pvec; - pagevec_init(&lru_pvec, 0); - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache(page, mapping, - page->index, GFP_KERNEL)) { - ret = stuffed_readpage(ip, page); - unlock_page(page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); - } - } - pagevec_lru_add(&lru_pvec); - ret = 0; - } else { - /* What we really want to do .... */ + if (!gfs2_is_stuffed(ip)) ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); - } if (do_unlock) { gfs2_glock_dq_m(1, &gh); @@ -356,8 +361,10 @@ static int gfs2_prepare_write(struct file *file, struct page *page, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh); error = gfs2_glock_nq_atime(&ip->i_gh); if (unlikely(error)) { - if (error == GLR_TRYFAILED) + if (error == GLR_TRYFAILED) { + unlock_page(page); error = AOP_TRUNCATED_PAGE; + } goto out_uninit; } @@ -594,6 +601,36 @@ static void gfs2_invalidatepage(struct page *page, unsigned long offset) return; } +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a journaled file makes any sense. For now we'll silently fall + * back to buffered I/O, likewise we do the same for stuffed + * files since they are (a) small and (b) unaligned. + */ + if (gfs2_is_jdata(ip)) + return 0; + + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset > i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -604,42 +641,28 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, struct gfs2_holder gh; int rv; - if (rw == READ) - mutex_lock(&inode->i_mutex); /* - * Shared lock, even if its a write, since we do no allocation - * on this path. All we need change is atime. + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. */ - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); rv = gfs2_glock_nq_atime(&gh); if (rv) - goto out; - - if (offset > i_size_read(inode)) - goto out; - - /* - * Should we return an error here? I can't see that O_DIRECT for - * a journaled file makes any sense. For now we'll silently fall - * back to buffered I/O, likewise we do the same for stuffed - * files since they are (a) small and (b) unaligned. - */ - if (gfs2_is_jdata(ip)) - goto out; - - if (gfs2_is_stuffed(ip)) - goto out; - - rv = blockdev_direct_IO_own_locking(rw, iocb, inode, - inode->i_sb->s_bdev, - iov, offset, nr_segs, - gfs2_get_block_direct, NULL); + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + gfs2_get_block_direct, NULL); out: gfs2_glock_dq_m(1, &gh); gfs2_holder_uninit(&gh); - if (rw == READ) - mutex_unlock(&inode->i_mutex); - return rv; } @@ -763,6 +786,7 @@ out: const struct address_space_operations gfs2_file_aops = { .writepage = gfs2_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .sync_page = block_sync_page, diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index d355899585d..c6bac6b6942 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -46,6 +45,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) struct gfs2_inum_host inum; unsigned int type; int error; + int had_lock=0; if (inode && is_bad_inode(inode)) goto invalid; @@ -53,9 +53,12 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) if (sdp->sd_args.ar_localcaching) goto valid; - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - if (error) - goto fail; + had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); switch (error) { @@ -82,13 +85,15 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) } valid_gunlock: - gfs2_glock_dq_uninit(&d_gh); + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); valid: dput(parent); return 1; invalid_gunlock: - gfs2_glock_dq_uninit(&d_gh); + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); invalid: if (inode && S_ISDIR(inode->i_mode)) { if (have_submounts(dentry)) diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index b4e7b877531..1de05b63d43 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -22,6 +21,7 @@ #include "glock.h" #include "glops.h" #include "inode.h" +#include "ops_dentry.h" #include "ops_export.h" #include "rgrp.h" #include "util.h" @@ -112,13 +112,12 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(void *opaque, const char *name, unsigned int length, - u64 offset, struct gfs2_inum_host *inum, - unsigned int type) +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) { - struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque; + struct get_name_filldir *gnfd = opaque; - if (!gfs2_inum_equal(inum, &gnfd->inum)) + if (inum != gnfd->inum.no_addr) return 0; memcpy(gnfd->name, name, length); @@ -189,6 +188,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) return ERR_PTR(-ENOMEM); } + dentry->d_op = &gfs2_dops; return dentry; } @@ -215,8 +215,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) } error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL, - &i_gh); + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return ERR_PTR(error); @@ -269,6 +268,7 @@ out_inode: return ERR_PTR(-ENOMEM); } + dentry->d_op = &gfs2_dops; return dentry; fail_rgd: diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index faa07e4b97d..b50180e2277 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -43,15 +42,6 @@ #include "util.h" #include "eaops.h" -/* For regular, non-NFS */ -struct filldir_reg { - struct gfs2_sbd *fdr_sbd; - int fdr_prefetch; - - filldir_t fdr_filldir; - void *fdr_opaque; -}; - /* * Most fields left uninitialised to catch anybody who tries to * use them. f_flags set to prevent file_accessed() from touching @@ -128,41 +118,6 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) } /** - * filldir_func - Report a directory entry to the caller of gfs2_dir_read() - * @opaque: opaque data used by the function - * @name: the name of the directory entry - * @length: the length of the name - * @offset: the entry's offset in the directory - * @inum: the inode number the entry points to - * @type: the type of inode the entry points to - * - * Returns: 0 on success, 1 if buffer full - */ - -static int filldir_func(void *opaque, const char *name, unsigned int length, - u64 offset, struct gfs2_inum_host *inum, - unsigned int type) -{ - struct filldir_reg *fdr = (struct filldir_reg *)opaque; - struct gfs2_sbd *sdp = fdr->fdr_sbd; - int error; - - error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset, - inum->no_addr, type); - if (error) - return 1; - - if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) { - gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY); - gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops, - LM_ST_SHARED, LM_FLAG_TRY); - } - - return 0; -} - -/** * gfs2_readdir - Read directory entries from a directory * @file: The directory to read from * @dirent: Buffer for dirents @@ -175,16 +130,10 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) { struct inode *dir = file->f_mapping->host; struct gfs2_inode *dip = GFS2_I(dir); - struct filldir_reg fdr; struct gfs2_holder d_gh; u64 offset = file->f_pos; int error; - fdr.fdr_sbd = GFS2_SB(dir); - fdr.fdr_prefetch = 1; - fdr.fdr_filldir = filldir; - fdr.fdr_opaque = dirent; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); error = gfs2_glock_nq_atime(&d_gh); if (error) { @@ -192,7 +141,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) return error; } - error = gfs2_dir_read(dir, &offset, &fdr, filldir_func); + error = gfs2_dir_read(dir, &offset, dirent, filldir); gfs2_glock_dq_uninit(&d_gh); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 636dda4c7d3..d85f6e05cb9 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -264,13 +263,23 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[2]; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; int error; + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq_m(3, ghs); if (error) goto out; @@ -291,10 +300,12 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) out_end_trans: gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(2, ghs); + gfs2_glock_dq_m(3, ghs); out: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -449,13 +460,22 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[2]; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + error = gfs2_glock_nq_m(3, ghs); if (error) goto out; @@ -483,10 +503,12 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(2, ghs); + gfs2_glock_dq_m(3, ghs); out: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -547,7 +569,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[4], r_gh; + struct gfs2_holder ghs[5], r_gh; + struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; int alloc_required; @@ -587,6 +610,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (nip) { gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } error = gfs2_glock_nq_m(num_gh, ghs); @@ -684,12 +714,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + al->al_rgd->rd_ri.ri_length + 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA, 0); + RES_STATFS + RES_QUOTA + 4, 0); if (error) goto out_ipreserv; } else { error = gfs2_trans_begin(sdp, 4 * RES_DINODE + - 5 * RES_LEAF, 0); + 5 * RES_LEAF + 4, 0); if (error) goto out_gunlock; } @@ -728,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out_end_trans; - ip->i_inode.i_ctime.tv_sec = get_seconds(); + ip->i_inode.i_ctime = CURRENT_TIME_SEC; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1018,7 +1048,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, } generic_fillattr(inode, stat); - if (unlock); + if (unlock) gfs2_glock_dq_uninit(&gh); return 0; @@ -1084,7 +1114,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); } -struct inode_operations gfs2_file_iops = { +const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, @@ -1094,7 +1124,7 @@ struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, }; -struct inode_operations gfs2_dev_iops = { +const struct inode_operations gfs2_dev_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, @@ -1104,7 +1134,7 @@ struct inode_operations gfs2_dev_iops = { .removexattr = gfs2_removexattr, }; -struct inode_operations gfs2_dir_iops = { +const struct inode_operations gfs2_dir_iops = { .create = gfs2_create, .lookup = gfs2_lookup, .link = gfs2_link, @@ -1123,7 +1153,7 @@ struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, }; -struct inode_operations gfs2_symlink_iops = { +const struct inode_operations gfs2_symlink_iops = { .readlink = gfs2_readlink, .follow_link = gfs2_follow_link, .permission = gfs2_permission, diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h index b15acb4fd34..34f0caac1a0 100644 --- a/fs/gfs2/ops_inode.h +++ b/fs/gfs2/ops_inode.h @@ -12,9 +12,9 @@ #include <linux/fs.h> -extern struct inode_operations gfs2_file_iops; -extern struct inode_operations gfs2_dir_iops; -extern struct inode_operations gfs2_symlink_iops; -extern struct inode_operations gfs2_dev_iops; +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct inode_operations gfs2_dev_iops; #endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 7685b46f934..b89999d3a76 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; int error; + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return; + for (;;) { error = gfs2_freeze_fs(sdp); if (!error) @@ -426,6 +429,12 @@ static void gfs2_delete_inode(struct inode *inode) } error = gfs2_dinode_dealloc(ip); + /* + * Must do this before unlock to avoid trying to write back + * potentially dirty data now that inode no longer exists + * on disk. + */ + truncate_inode_pages(&inode->i_data, 0); out_unlock: gfs2_glock_dq(&ip->i_iopen_gh); @@ -443,14 +452,12 @@ out: static struct inode *gfs2_alloc_inode(struct super_block *sb) { - struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip; ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); if (ip) { ip->i_flags = 0; ip->i_gl = NULL; - ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default); ip->i_last_pfault = jiffies; } return &ip->i_inode; @@ -461,7 +468,7 @@ static void gfs2_destroy_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, inode); } -struct super_operations gfs2_super_ops = { +const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .destroy_inode = gfs2_destroy_inode, .write_inode = gfs2_write_inode, diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h index 9de73f042f7..442a274c627 100644 --- a/fs/gfs2/ops_super.h +++ b/fs/gfs2/ops_super.h @@ -12,6 +12,6 @@ #include <linux/fs.h> -extern struct super_operations gfs2_super_ops; +extern const struct super_operations gfs2_super_ops; #endif /* __OPS_SUPER_DOT_H__ */ diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index 45a5f11fc39..aa0dbd2aac1 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -28,34 +27,13 @@ #include "trans.h" #include "util.h" -static void pfault_be_greedy(struct gfs2_inode *ip) -{ - unsigned int time; - - spin_lock(&ip->i_spin); - time = ip->i_greedy; - ip->i_last_pfault = jiffies; - spin_unlock(&ip->i_spin); - - igrab(&ip->i_inode); - if (gfs2_glock_be_greedy(ip->i_gl, time)) - iput(&ip->i_inode); -} - static struct page *gfs2_private_nopage(struct vm_area_struct *area, unsigned long address, int *type) { struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host); - struct page *result; set_bit(GIF_PAGED, &ip->i_flags); - - result = filemap_nopage(area, address, type); - - if (result && result != NOPAGE_OOM) - pfault_be_greedy(ip); - - return result; + return filemap_nopage(area, address, type); } static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) @@ -167,7 +145,6 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, set_page_dirty(result); } - pfault_be_greedy(ip); out: gfs2_glock_dq_uninit(&i_gh); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index d0c806b85c8..8bc182c7e2e 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ff0846528d5..8d9c08b5c4b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 43a24f2e590..70f424fcf1c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -71,17 +71,12 @@ void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_atime_quantum = 3600; gt->gt_new_files_jdata = 0; gt->gt_new_files_directio = 0; - gt->gt_max_atomic_write = 4 << 20; gt->gt_max_readahead = 1 << 18; gt->gt_lockdump_size = 131072; gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; gt->gt_reclaim_limit = 5000; gt->gt_entries_per_readdir = 32; - gt->gt_prefetch_secs = 10; - gt->gt_greedy_default = HZ / 10; - gt->gt_greedy_quantum = HZ / 40; - gt->gt_greedy_max = HZ / 4; gt->gt_statfs_quantum = 30; gt->gt_statfs_slow = 0; } @@ -359,8 +354,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) mutex_lock(&sdp->sd_jindex_mutex); for (;;) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - GL_LOCAL_EXCL, ji_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); if (error) break; @@ -529,8 +523,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - GL_LOCAL_EXCL, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); if (error) return error; @@ -583,9 +576,8 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp); gfs2_statfs_sync(sdp); - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - GL_LOCAL_EXCL | GL_NOCACHE, - &t_gh); + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return error; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 983eaf1e06b..d01f9f0fda2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -436,17 +436,12 @@ TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(reclaim_limit, 0); -TUNE_ATTR(prefetch_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(new_files_directio, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(quota_cache_secs, 1); -TUNE_ATTR(max_atomic_write, 1); TUNE_ATTR(stall_secs, 1); -TUNE_ATTR(greedy_default, 1); -TUNE_ATTR(greedy_quantum, 1); -TUNE_ATTR(greedy_max, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_DAEMON(scand_secs, scand_process); TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); @@ -465,15 +460,10 @@ static struct attribute *tune_attrs[] = { &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_reclaim_limit.attr, - &tune_attr_prefetch_secs.attr, &tune_attr_statfs_slow.attr, &tune_attr_quota_simul_sync.attr, &tune_attr_quota_cache_secs.attr, - &tune_attr_max_atomic_write.attr, &tune_attr_stall_secs.attr, - &tune_attr_greedy_default.attr, - &tune_attr_greedy_quantum.attr, - &tune_attr_greedy_max.attr, &tune_attr_statfs_quantum.attr, &tune_attr_scand_secs.attr, &tune_attr_recoverd_secs.attr, diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index e5707a9f78c..601eaa1b9ed 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -7,7 +7,6 @@ * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index e2e0358da33..7c69b98a2e4 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -320,7 +320,7 @@ const struct file_operations hfs_dir_operations = { .release = hfs_dir_release, }; -struct inode_operations hfs_dir_inode_operations = { +const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, .unlink = hfs_unlink, diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h index 88099ab1a18..1445e3a56ed 100644 --- a/fs/hfs/hfs.h +++ b/fs/hfs/hfs.h @@ -83,8 +83,6 @@ /*======== HFS structures as they appear on the disk ========*/ -#define __packed __attribute__ ((packed)) - /* Pascal-style string of up to 31 characters */ struct hfs_name { u8 len; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 735332dfd1b..147374b6f67 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -170,7 +170,7 @@ extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qst /* dir.c */ extern const struct file_operations hfs_dir_operations; -extern struct inode_operations hfs_dir_inode_operations; +extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 5cb7f8fee8d..fafcba59387 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -18,7 +18,7 @@ #include "btree.h" static const struct file_operations hfs_file_operations; -static struct inode_operations hfs_file_inode_operations; +static const struct inode_operations hfs_file_inode_operations; /*================ Variable-like macros ================*/ @@ -612,7 +612,7 @@ static const struct file_operations hfs_file_operations = { .release = hfs_file_release, }; -static struct inode_operations hfs_file_inode_operations = { +static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, .truncate = hfs_file_truncate, .setattr = hfs_inode_setattr, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index a3698796600..623f509f1d4 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -154,7 +154,7 @@ static void hfs_destroy_inode(struct inode *inode) kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } -static struct super_operations hfs_super_operations = { +static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .destroy_inode = hfs_destroy_inode, .write_inode = hfs_write_inode, diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index f2d7c49ce75..ba117c445e7 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -8,7 +8,6 @@ * Handling of catalog records */ -#include <linux/sched.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e886ac8460d..80b5682a227 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -10,7 +10,6 @@ #include <linux/errno.h> #include <linux/fs.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/random.h> @@ -471,7 +470,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, return res; } -struct inode_operations hfsplus_dir_inode_operations = { +const struct inode_operations hfsplus_dir_inode_operations = { .lookup = hfsplus_lookup, .create = hfsplus_create, .link = hfsplus_link, diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 49205531a50..fe99fe8db61 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -15,8 +15,6 @@ #include <linux/types.h> -#define __packed __attribute__ ((packed)) - /* Some constants */ #define HFSPLUS_SECTOR_SIZE 512 #define HFSPLUS_SECTOR_SHIFT 9 diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 75e8c4d8aac..642012ac337 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -268,10 +268,10 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -extern struct inode_operations hfsplus_dir_inode_operations; +extern const struct inode_operations hfsplus_dir_inode_operations; extern struct file_operations hfsplus_dir_operations; -static struct inode_operations hfsplus_file_inode_operations = { +static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .truncate = hfsplus_file_truncate, .permission = hfsplus_permission, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 0f513c6bf84..1a97f929344 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,7 +11,6 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/fs.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -260,7 +259,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations hfsplus_sops = { +static const struct super_operations hfsplus_sops = { .alloc_inode = hfsplus_alloc_inode, .destroy_inode = hfsplus_destroy_inode, .read_inode = hfsplus_read_inode, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 69a376f35a6..e965eb11d76 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -52,8 +52,8 @@ static int append = 0; #define HOSTFS_SUPER_MAGIC 0x00c0ffee -static struct inode_operations hostfs_iops; -static struct inode_operations hostfs_dir_iops; +static const struct inode_operations hostfs_iops; +static const struct inode_operations hostfs_dir_iops; static const struct address_space_operations hostfs_link_aops; #ifndef MODULE @@ -309,7 +309,7 @@ static void hostfs_read_inode(struct inode *inode) read_inode(inode); } -static struct super_operations hostfs_sbops = { +static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .drop_inode = generic_delete_inode, .delete_inode = hostfs_delete_inode, @@ -880,7 +880,7 @@ int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return(0); } -static struct inode_operations hostfs_iops = { +static const struct inode_operations hostfs_iops = { .create = hostfs_create, .link = hostfs_link, .unlink = hostfs_unlink, @@ -894,7 +894,7 @@ static struct inode_operations hostfs_iops = { .getattr = hostfs_getattr, }; -static struct inode_operations hostfs_dir_iops = { +static const struct inode_operations hostfs_dir_iops = { .create = hostfs_create, .lookup = hostfs_lookup, .link = hostfs_link, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index fb4c8915010..b4eafc0f1e5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -132,7 +132,7 @@ const struct file_operations hpfs_file_ops = .sendfile = generic_file_sendfile, }; -struct inode_operations hpfs_file_iops = +const struct inode_operations hpfs_file_iops = { .truncate = hpfs_truncate, .setattr = hpfs_notify_change, diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 1c07aa82d32..42ff60ccf2a 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -266,7 +266,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); int hpfs_file_fsync(struct file *, struct dentry *, int); extern const struct file_operations hpfs_file_ops; -extern struct inode_operations hpfs_file_iops; +extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops; /* inode.c */ @@ -302,7 +302,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); /* namei.c */ -extern struct inode_operations hpfs_dir_iops; +extern const struct inode_operations hpfs_dir_iops; extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 2507e7393f3..9953cf9a2f1 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -659,7 +659,7 @@ end1: return err; } -struct inode_operations hpfs_dir_iops = +const struct inode_operations hpfs_dir_iops = { .create = hpfs_create, .lookup = hpfs_lookup, diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index d4abc1a1d56..e0174e33852 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -426,7 +426,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) /* Super operations */ -static struct super_operations hpfs_sops = +static const struct super_operations hpfs_sops = { .alloc_inode = hpfs_alloc_inode, .destroy_inode = hpfs_destroy_inode, diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index afd340a45da..affb7412125 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -43,7 +43,7 @@ static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) #define HPPFS_SUPER_MAGIC 0xb00000ee -static struct super_operations hppfs_sbops; +static const struct super_operations hppfs_sbops; static int is_pid(struct dentry *dentry) { @@ -212,7 +212,7 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, return(ERR_PTR(err)); } -static struct inode_operations hppfs_file_iops = { +static const struct inode_operations hppfs_file_iops = { }; static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, @@ -649,7 +649,7 @@ static void hppfs_destroy_inode(struct inode *inode) kfree(HPPFS_I(inode)); } -static struct super_operations hppfs_sbops = { +static const struct super_operations hppfs_sbops = { .alloc_inode = hppfs_alloc_inode, .destroy_inode = hppfs_destroy_inode, .read_inode = hppfs_read_inode, @@ -693,11 +693,11 @@ static void* hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) return ret; } -static struct inode_operations hppfs_dir_iops = { +static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, }; -static struct inode_operations hppfs_link_iops = { +static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4f4cd132b57..8c718a3d413 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -33,11 +33,11 @@ /* some random number */ #define HUGETLBFS_MAGIC 0x958458f6 -static struct super_operations hugetlbfs_ops; +static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; -static struct inode_operations hugetlbfs_dir_inode_operations; -static struct inode_operations hugetlbfs_inode_operations; +static const struct inode_operations hugetlbfs_dir_inode_operations; +static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -449,10 +449,13 @@ static int hugetlbfs_symlink(struct inode *dir, } /* - * For direct-IO reads into hugetlb pages + * mark the head page dirty */ static int hugetlbfs_set_page_dirty(struct page *page) { + struct page *head = (struct page *)page_private(page); + + SetPageDirty(head); return 0; } @@ -560,7 +563,7 @@ const struct file_operations hugetlbfs_file_operations = { .get_unmapped_area = hugetlb_get_unmapped_area, }; -static struct inode_operations hugetlbfs_dir_inode_operations = { +static const struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, .link = simple_link, @@ -573,11 +576,11 @@ static struct inode_operations hugetlbfs_dir_inode_operations = { .setattr = hugetlbfs_setattr, }; -static struct inode_operations hugetlbfs_inode_operations = { +static const struct inode_operations hugetlbfs_inode_operations = { .setattr = hugetlbfs_setattr, }; -static struct super_operations hugetlbfs_ops = { +static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .destroy_inode = hugetlbfs_destroy_inode, .statfs = hugetlbfs_statfs, diff --git a/fs/inode.c b/fs/inode.c index bf21dc6d0db..5abb097ab1b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -414,7 +414,8 @@ static void prune_icache(int nr_to_scan) __iget(inode); spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) - reap += invalidate_inode_pages(&inode->i_data); + reap += invalidate_mapping_pages(&inode->i_data, + 0, -1); iput(inode); spin_lock(&inode_lock); @@ -709,7 +710,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) + if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) __iget(inode); else /* @@ -999,7 +1000,7 @@ EXPORT_SYMBOL(remove_inode_hash); */ void generic_delete_inode(struct inode *inode) { - struct super_operations *op = inode->i_sb->s_op; + const struct super_operations *op = inode->i_sb->s_op; list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); @@ -1092,7 +1093,7 @@ EXPORT_SYMBOL_GPL(generic_drop_inode); */ static inline void iput_final(struct inode *inode) { - struct super_operations *op = inode->i_sb->s_op; + const struct super_operations *op = inode->i_sb->s_op; void (*drop)(struct inode *) = generic_drop_inode; if (op && op->drop_inode) @@ -1112,7 +1113,7 @@ static inline void iput_final(struct inode *inode) void iput(struct inode *inode) { if (inode) { - struct super_operations *op = inode->i_sb->s_op; + const struct super_operations *op = inode->i_sb->s_op; BUG_ON(inode->i_state == I_CLEAR); @@ -1160,11 +1161,9 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - if (IS_RDONLY(inode)) - return; if (inode->i_flags & S_NOATIME) return; - if (inode->i_sb->s_flags & MS_NOATIME) + if (IS_NOATIME(inode)) return; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) return; @@ -1252,33 +1251,6 @@ int inode_needs_sync(struct inode *inode) EXPORT_SYMBOL(inode_needs_sync); -/* - * Quota functions that want to walk the inode lists.. - */ -#ifdef CONFIG_QUOTA - -void remove_dquot_ref(struct super_block *sb, int type, - struct list_head *tofree_head) -{ - struct inode *inode; - - if (!sb->dq_op) - return; /* nothing to do */ - spin_lock(&inode_lock); /* This lock is for inodes code */ - - /* - * We don't have to lock against quota code - test IS_QUOTAINIT is - * just for speedup... - */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) - if (!IS_NOQUOTA(inode)) - remove_inode_dquot_ref(inode, type, tofree_head); - - spin_unlock(&inode_lock); -} - -#endif - int inode_wait(void *word) { schedule(); diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 55f6da55b7c..9f2224f65a1 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -455,8 +455,16 @@ static ssize_t inotify_read(struct file *file, char __user *buf, break; kevent = inotify_dev_get_event(dev); - if (event_size + kevent->event.len > count) + if (event_size + kevent->event.len > count) { + if (ret == 0 && count > 0) { + /* + * could not get a single event because we + * didn't have enough buffer space. + */ + ret = -EINVAL; + } break; + } if (copy_to_user(buf, &kevent->event, event_size)) { ret = -EFAULT; diff --git a/fs/ioprio.c b/fs/ioprio.c index 89e8da112a7..10d2c211d18 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -60,6 +60,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) int data = IOPRIO_PRIO_DATA(ioprio); struct task_struct *p, *g; struct user_struct *user; + struct pid *pgrp; int ret; switch (class) { @@ -98,12 +99,14 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) break; case IOPRIO_WHO_PGRP: if (!who) - who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + pgrp = task_pgrp(current); + else + pgrp = find_pid(who); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -167,6 +170,7 @@ asmlinkage long sys_ioprio_get(int which, int who) { struct task_struct *g, *p; struct user_struct *user; + struct pid *pgrp; int ret = -ESRCH; int tmpio; @@ -182,8 +186,10 @@ asmlinkage long sys_ioprio_get(int which, int who) break; case IOPRIO_WHO_PGRP: if (!who) - who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + pgrp = task_pgrp(current); + else + pgrp = find_pid(who); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -191,7 +197,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 4af2548f97a..0e94c31cad9 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -24,7 +24,7 @@ const struct file_operations isofs_dir_operations = /* * directories can handle most operations... */ -struct inode_operations isofs_dir_inode_operations = +const struct inode_operations isofs_dir_inode_operations = { .lookup = isofs_lookup, }; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ea55b6c469e..64a96cdfe3a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -106,7 +106,7 @@ static int isofs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations isofs_sops = { +static const struct super_operations isofs_sops = { .alloc_inode = isofs_alloc_inode, .destroy_inode = isofs_destroy_inode, .read_inode = isofs_read_inode, diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index e6308c8b573..efe2872cd4e 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -174,7 +174,7 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de, } } -extern struct inode_operations isofs_dir_inode_operations; +extern const struct inode_operations isofs_dir_inode_operations; extern const struct file_operations isofs_dir_operations; extern const struct address_space_operations isofs_symlink_aops; extern struct export_operations isofs_export_ops; diff --git a/fs/jffs/Makefile b/fs/jffs/Makefile deleted file mode 100644 index 9c1c0bb5969..00000000000 --- a/fs/jffs/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for the linux Journalling Flash FileSystem (JFFS) routines. -# -# $Id: Makefile,v 1.11 2001/09/25 20:59:41 dwmw2 Exp $ -# - -obj-$(CONFIG_JFFS_FS) += jffs.o - -jffs-y := jffs_fm.o intrep.o inode-v23.o -jffs-$(CONFIG_JFFS_PROC_FS) += jffs_proc.o -jffs-objs := $(jffs-y) diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c deleted file mode 100644 index 43baa1afa02..00000000000 --- a/fs/jffs/inode-v23.c +++ /dev/null @@ -1,1847 +0,0 @@ -/* - * JFFS -- Journalling Flash File System, Linux implementation. - * - * Copyright (C) 1999, 2000 Axis Communications AB. - * - * Created by Finn Hakansson <finn@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: inode-v23.c,v 1.70 2001/10/02 09:16:02 dwmw2 Exp $ - * - * Ported to Linux 2.3.x and MTD: - * Copyright (C) 2000 Alexander Larsson (alex@cendio.se), Cendio Systems AB - * - * Copyright 2000, 2001 Red Hat, Inc. - */ - -/* inode.c -- Contains the code that is called from the VFS. */ - -/* TODO-ALEX: - * uid and gid are just 16 bit. - * jffs_file_write reads from user-space pointers without xx_from_user - * maybe other stuff do to. - */ - -#include <linux/time.h> - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/jffs.h> -#include <linux/fs.h> -#include <linux/smp_lock.h> -#include <linux/ioctl.h> -#include <linux/stat.h> -#include <linux/blkdev.h> -#include <linux/quotaops.h> -#include <linux/highmem.h> -#include <linux/vfs.h> -#include <linux/mutex.h> -#include <asm/byteorder.h> -#include <asm/uaccess.h> - -#include "jffs_fm.h" -#include "intrep.h" -#ifdef CONFIG_JFFS_PROC_FS -#include "jffs_proc.h" -#endif - -static int jffs_remove(struct inode *dir, struct dentry *dentry, int type); - -static struct super_operations jffs_ops; -static const struct file_operations jffs_file_operations; -static struct inode_operations jffs_file_inode_operations; -static const struct file_operations jffs_dir_operations; -static struct inode_operations jffs_dir_inode_operations; -static const struct address_space_operations jffs_address_operations; - -struct kmem_cache *node_cache = NULL; -struct kmem_cache *fm_cache = NULL; - -/* Called by the VFS at mount time to initialize the whole file system. */ -static int jffs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *root_inode; - struct jffs_control *c; - - sb->s_flags |= MS_NODIRATIME; - - D1(printk(KERN_NOTICE "JFFS: Trying to mount device %s.\n", - sb->s_id)); - - if (MAJOR(sb->s_dev) != MTD_BLOCK_MAJOR) { - printk(KERN_WARNING "JFFS: Trying to mount a " - "non-mtd device.\n"); - return -EINVAL; - } - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_fs_info = (void *) 0; - sb->s_maxbytes = 0xFFFFFFFF; - - /* Build the file system. */ - if (jffs_build_fs(sb) < 0) { - goto jffs_sb_err1; - } - - /* - * set up enough so that we can read an inode - */ - sb->s_magic = JFFS_MAGIC_SB_BITMASK; - sb->s_op = &jffs_ops; - - root_inode = iget(sb, JFFS_MIN_INO); - if (!root_inode) - goto jffs_sb_err2; - - /* Get the root directory of this file system. */ - if (!(sb->s_root = d_alloc_root(root_inode))) { - goto jffs_sb_err3; - } - - c = (struct jffs_control *) sb->s_fs_info; - -#ifdef CONFIG_JFFS_PROC_FS - /* Set up the jffs proc file system. */ - if (jffs_register_jffs_proc_dir(MINOR(sb->s_dev), c) < 0) { - printk(KERN_WARNING "JFFS: Failed to initialize the JFFS " - "proc file system for device %s.\n", - sb->s_id); - } -#endif - - /* Set the Garbage Collection thresholds */ - - /* GC if free space goes below 5% of the total size */ - c->gc_minfree_threshold = c->fmc->flash_size / 20; - - if (c->gc_minfree_threshold < c->fmc->sector_size) - c->gc_minfree_threshold = c->fmc->sector_size; - - /* GC if dirty space exceeds 33% of the total size. */ - c->gc_maxdirty_threshold = c->fmc->flash_size / 3; - - if (c->gc_maxdirty_threshold < c->fmc->sector_size) - c->gc_maxdirty_threshold = c->fmc->sector_size; - - - c->thread_pid = kernel_thread (jffs_garbage_collect_thread, - (void *) c, - CLONE_KERNEL); - D1(printk(KERN_NOTICE "JFFS: GC thread pid=%d.\n", (int) c->thread_pid)); - - D1(printk(KERN_NOTICE "JFFS: Successfully mounted device %s.\n", - sb->s_id)); - return 0; - -jffs_sb_err3: - iput(root_inode); -jffs_sb_err2: - jffs_cleanup_control((struct jffs_control *)sb->s_fs_info); -jffs_sb_err1: - printk(KERN_WARNING "JFFS: Failed to mount device %s.\n", - sb->s_id); - return -EINVAL; -} - - -/* This function is called when the file system is umounted. */ -static void -jffs_put_super(struct super_block *sb) -{ - struct jffs_control *c = (struct jffs_control *) sb->s_fs_info; - - D2(printk("jffs_put_super()\n")); - -#ifdef CONFIG_JFFS_PROC_FS - jffs_unregister_jffs_proc_dir(c); -#endif - - if (c->gc_task) { - D1(printk (KERN_NOTICE "jffs_put_super(): Telling gc thread to die.\n")); - send_sig(SIGKILL, c->gc_task, 1); - } - wait_for_completion(&c->gc_thread_comp); - - D1(printk (KERN_NOTICE "jffs_put_super(): Successfully waited on thread.\n")); - - jffs_cleanup_control((struct jffs_control *)sb->s_fs_info); - D1(printk(KERN_NOTICE "JFFS: Successfully unmounted device %s.\n", - sb->s_id)); -} - - -/* This function is called when user commands like chmod, chgrp and - chown are executed. System calls like trunc() results in a call - to this function. */ -static int -jffs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_fmcontrol *fmc; - struct jffs_file *f; - struct jffs_node *new_node; - int update_all; - int res = 0; - int recoverable = 0; - - lock_kernel(); - - if ((res = inode_change_ok(inode, iattr))) - goto out; - - c = (struct jffs_control *)inode->i_sb->s_fs_info; - fmc = c->fmc; - - D3(printk (KERN_NOTICE "notify_change(): down biglock\n")); - mutex_lock(&fmc->biglock); - - f = jffs_find_file(c, inode->i_ino); - - ASSERT(if (!f) { - printk("jffs_setattr(): Invalid inode number: %lu\n", - inode->i_ino); - D3(printk (KERN_NOTICE "notify_change(): up biglock\n")); - mutex_unlock(&fmc->biglock); - res = -EINVAL; - goto out; - }); - - D1(printk("***jffs_setattr(): file: \"%s\", ino: %u\n", - f->name, f->ino)); - - update_all = iattr->ia_valid & ATTR_FORCE; - - if ( (update_all || iattr->ia_valid & ATTR_SIZE) - && (iattr->ia_size + 128 < f->size) ) { - /* We're shrinking the file by more than 128 bytes. - We'll be able to GC and recover this space, so - allow it to go into the reserved space. */ - recoverable = 1; - } - - if (!(new_node = jffs_alloc_node())) { - D(printk("jffs_setattr(): Allocation failed!\n")); - D3(printk (KERN_NOTICE "notify_change(): up biglock\n")); - mutex_unlock(&fmc->biglock); - res = -ENOMEM; - goto out; - } - - new_node->data_offset = 0; - new_node->removed_size = 0; - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = f->ino; - raw_inode.pino = f->pino; - raw_inode.mode = f->mode; - raw_inode.uid = f->uid; - raw_inode.gid = f->gid; - raw_inode.atime = f->atime; - raw_inode.mtime = f->mtime; - raw_inode.ctime = f->ctime; - raw_inode.dsize = 0; - raw_inode.offset = 0; - raw_inode.rsize = 0; - raw_inode.dsize = 0; - raw_inode.nsize = f->nsize; - raw_inode.nlink = f->nlink; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - if (update_all || iattr->ia_valid & ATTR_MODE) { - raw_inode.mode = iattr->ia_mode; - inode->i_mode = iattr->ia_mode; - } - if (update_all || iattr->ia_valid & ATTR_UID) { - raw_inode.uid = iattr->ia_uid; - inode->i_uid = iattr->ia_uid; - } - if (update_all || iattr->ia_valid & ATTR_GID) { - raw_inode.gid = iattr->ia_gid; - inode->i_gid = iattr->ia_gid; - } - if (update_all || iattr->ia_valid & ATTR_SIZE) { - int len; - D1(printk("jffs_notify_change(): Changing size " - "to %lu bytes!\n", (long)iattr->ia_size)); - raw_inode.offset = iattr->ia_size; - - /* Calculate how many bytes need to be removed from - the end. */ - if (f->size < iattr->ia_size) { - len = 0; - } - else { - len = f->size - iattr->ia_size; - } - - raw_inode.rsize = len; - - /* The updated node will be a removal node, with - base at the new size and size of the nbr of bytes - to be removed. */ - new_node->data_offset = iattr->ia_size; - new_node->removed_size = len; - inode->i_size = iattr->ia_size; - inode->i_blocks = (inode->i_size + 511) >> 9; - - if (len) { - invalidate_inode_pages(inode->i_mapping); - } - inode->i_ctime = CURRENT_TIME_SEC; - inode->i_mtime = inode->i_ctime; - } - if (update_all || iattr->ia_valid & ATTR_ATIME) { - raw_inode.atime = iattr->ia_atime.tv_sec; - inode->i_atime = iattr->ia_atime; - } - if (update_all || iattr->ia_valid & ATTR_MTIME) { - raw_inode.mtime = iattr->ia_mtime.tv_sec; - inode->i_mtime = iattr->ia_mtime; - } - if (update_all || iattr->ia_valid & ATTR_CTIME) { - raw_inode.ctime = iattr->ia_ctime.tv_sec; - inode->i_ctime = iattr->ia_ctime; - } - - /* Write this node to the flash. */ - if ((res = jffs_write_node(c, new_node, &raw_inode, f->name, NULL, recoverable, f)) < 0) { - D(printk("jffs_notify_change(): The write failed!\n")); - jffs_free_node(new_node); - D3(printk (KERN_NOTICE "n_c(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - goto out; - } - - jffs_insert_node(c, f, &raw_inode, NULL, new_node); - - mark_inode_dirty(inode); - D3(printk (KERN_NOTICE "n_c(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); -out: - unlock_kernel(); - return res; -} /* jffs_notify_change() */ - - -static struct inode * -jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode, - int * err) -{ - struct super_block * sb; - struct inode * inode; - struct jffs_control *c; - struct jffs_file *f; - - sb = dir->i_sb; - inode = new_inode(sb); - if (!inode) { - *err = -ENOMEM; - return NULL; - } - - c = (struct jffs_control *)sb->s_fs_info; - - inode->i_ino = raw_inode->ino; - inode->i_mode = raw_inode->mode; - inode->i_nlink = raw_inode->nlink; - inode->i_uid = raw_inode->uid; - inode->i_gid = raw_inode->gid; - inode->i_size = raw_inode->dsize; - inode->i_atime.tv_sec = raw_inode->atime; - inode->i_mtime.tv_sec = raw_inode->mtime; - inode->i_ctime.tv_sec = raw_inode->ctime; - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_blocks = (inode->i_size + 511) >> 9; - - f = jffs_find_file(c, raw_inode->ino); - - inode->i_private = (void *)f; - insert_inode_hash(inode); - - return inode; -} - -/* Get statistics of the file system. */ -static int -jffs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info; - struct jffs_fmcontrol *fmc; - - lock_kernel(); - - fmc = c->fmc; - - D2(printk("jffs_statfs()\n")); - - buf->f_type = JFFS_MAGIC_SB_BITMASK; - buf->f_bsize = PAGE_CACHE_SIZE; - buf->f_blocks = (fmc->flash_size / PAGE_CACHE_SIZE) - - (fmc->min_free_size / PAGE_CACHE_SIZE); - buf->f_bfree = (jffs_free_size1(fmc) + jffs_free_size2(fmc) + - fmc->dirty_size - fmc->min_free_size) - >> PAGE_CACHE_SHIFT; - buf->f_bavail = buf->f_bfree; - - /* Find out how many files there are in the filesystem. */ - buf->f_files = jffs_foreach_file(c, jffs_file_count); - buf->f_ffree = buf->f_bfree; - /* buf->f_fsid = 0; */ - buf->f_namelen = JFFS_MAX_NAME_LEN; - - unlock_kernel(); - - return 0; -} - - -/* Rename a file. */ -static int -jffs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_file *old_dir_f; - struct jffs_file *new_dir_f; - struct jffs_file *del_f; - struct jffs_file *f; - struct jffs_node *node; - struct inode *inode; - int result = 0; - __u32 rename_data = 0; - - D2(printk("***jffs_rename()\n")); - - D(printk("jffs_rename(): old_dir: 0x%p, old name: 0x%p, " - "new_dir: 0x%p, new name: 0x%p\n", - old_dir, old_dentry->d_name.name, - new_dir, new_dentry->d_name.name)); - - lock_kernel(); - c = (struct jffs_control *)old_dir->i_sb->s_fs_info; - ASSERT(if (!c) { - printk(KERN_ERR "jffs_rename(): The old_dir inode " - "didn't have a reference to a jffs_file struct\n"); - unlock_kernel(); - return -EIO; - }); - - result = -ENOTDIR; - if (!(old_dir_f = old_dir->i_private)) { - D(printk("jffs_rename(): Old dir invalid.\n")); - goto jffs_rename_end; - } - - /* Try to find the file to move. */ - result = -ENOENT; - if (!(f = jffs_find_child(old_dir_f, old_dentry->d_name.name, - old_dentry->d_name.len))) { - goto jffs_rename_end; - } - - /* Find the new directory. */ - result = -ENOTDIR; - if (!(new_dir_f = new_dir->i_private)) { - D(printk("jffs_rename(): New dir invalid.\n")); - goto jffs_rename_end; - } - D3(printk (KERN_NOTICE "rename(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - /* Create a node and initialize as much as needed. */ - result = -ENOMEM; - if (!(node = jffs_alloc_node())) { - D(printk("jffs_rename(): Allocation failed: node == 0\n")); - goto jffs_rename_end; - } - node->data_offset = 0; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = f->ino; - raw_inode.pino = new_dir_f->ino; -/* raw_inode.version = f->highest_version + 1; */ - raw_inode.mode = f->mode; - raw_inode.uid = current->fsuid; - raw_inode.gid = current->fsgid; -#if 0 - raw_inode.uid = f->uid; - raw_inode.gid = f->gid; -#endif - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = f->ctime; - raw_inode.offset = 0; - raw_inode.dsize = 0; - raw_inode.rsize = 0; - raw_inode.nsize = new_dentry->d_name.len; - raw_inode.nlink = f->nlink; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - /* See if there already exists a file with the same name as - new_name. */ - if ((del_f = jffs_find_child(new_dir_f, new_dentry->d_name.name, - new_dentry->d_name.len))) { - raw_inode.rename = 1; - raw_inode.dsize = sizeof(__u32); - rename_data = del_f->ino; - } - - /* Write the new node to the flash memory. */ - if ((result = jffs_write_node(c, node, &raw_inode, - new_dentry->d_name.name, - (unsigned char*)&rename_data, 0, f)) < 0) { - D(printk("jffs_rename(): Failed to write node to flash.\n")); - jffs_free_node(node); - goto jffs_rename_end; - } - raw_inode.dsize = 0; - - if (raw_inode.rename) { - /* The file with the same name must be deleted. */ - //FIXME deadlock down(&c->fmc->gclock); - if ((result = jffs_remove(new_dir, new_dentry, - del_f->mode)) < 0) { - /* This is really bad. */ - printk(KERN_ERR "JFFS: An error occurred in " - "rename().\n"); - } - // up(&c->fmc->gclock); - } - - if (old_dir_f != new_dir_f) { - /* Remove the file from its old position in the - filesystem tree. */ - jffs_unlink_file_from_tree(f); - } - - /* Insert the new node into the file system. */ - if ((result = jffs_insert_node(c, f, &raw_inode, - new_dentry->d_name.name, node)) < 0) { - D(printk(KERN_ERR "jffs_rename(): jffs_insert_node() " - "failed!\n")); - } - - if (old_dir_f != new_dir_f) { - /* Insert the file to its new position in the - file system. */ - jffs_insert_file_into_tree(f); - } - - /* This is a kind of update of the inode we're about to make - here. This is what they do in ext2fs. Kind of. */ - if ((inode = iget(new_dir->i_sb, f->ino))) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - iput(inode); - } - -jffs_rename_end: - D3(printk (KERN_NOTICE "rename(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return result; -} /* jffs_rename() */ - - -/* Read the contents of a directory. Used by programs like `ls' - for instance. */ -static int -jffs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct jffs_file *f; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; - int j; - int ddino; - lock_kernel(); - D3(printk (KERN_NOTICE "readdir(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - D2(printk("jffs_readdir(): inode: 0x%p, filp: 0x%p\n", inode, filp)); - if (filp->f_pos == 0) { - D3(printk("jffs_readdir(): \".\" %lu\n", inode->i_ino)); - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) { - D3(printk (KERN_NOTICE "readdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return 0; - } - filp->f_pos = 1; - } - if (filp->f_pos == 1) { - if (inode->i_ino == JFFS_MIN_INO) { - ddino = JFFS_MIN_INO; - } - else { - ddino = ((struct jffs_file *) - inode->i_private)->pino; - } - D3(printk("jffs_readdir(): \"..\" %u\n", ddino)); - if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) { - D3(printk (KERN_NOTICE "readdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return 0; - } - filp->f_pos++; - } - f = ((struct jffs_file *)inode->i_private)->children; - - j = 2; - while(f && (f->deleted || j++ < filp->f_pos )) { - f = f->sibling_next; - } - - while (f) { - D3(printk("jffs_readdir(): \"%s\" ino: %u\n", - (f->name ? f->name : ""), f->ino)); - if (filldir(dirent, f->name, f->nsize, - filp->f_pos , f->ino, DT_UNKNOWN) < 0) { - D3(printk (KERN_NOTICE "readdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return 0; - } - filp->f_pos++; - do { - f = f->sibling_next; - } while(f && f->deleted); - } - D3(printk (KERN_NOTICE "readdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return filp->f_pos; -} /* jffs_readdir() */ - - -/* Find a file in a directory. If the file exists, return its - corresponding dentry. */ -static struct dentry * -jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct jffs_file *d; - struct jffs_file *f; - struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info; - int len; - int r = 0; - const char *name; - struct inode *inode = NULL; - - len = dentry->d_name.len; - name = dentry->d_name.name; - - lock_kernel(); - - D3({ - char *s = kmalloc(len + 1, GFP_KERNEL); - memcpy(s, name, len); - s[len] = '\0'; - printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s); - kfree(s); - }); - - D3(printk (KERN_NOTICE "lookup(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - r = -ENAMETOOLONG; - if (len > JFFS_MAX_NAME_LEN) { - goto jffs_lookup_end; - } - - r = -EACCES; - if (!(d = (struct jffs_file *)dir->i_private)) { - D(printk("jffs_lookup(): No such inode! (%lu)\n", - dir->i_ino)); - goto jffs_lookup_end; - } - - /* Get the corresponding inode to the file. */ - - /* iget calls jffs_read_inode, so we need to drop the biglock - before calling iget. Unfortunately, the GC has a tendency - to sneak in here, because iget sometimes calls schedule (). - */ - - if ((len == 1) && (name[0] == '.')) { - D3(printk (KERN_NOTICE "lookup(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - if (!(inode = iget(dir->i_sb, d->ino))) { - D(printk("jffs_lookup(): . iget() ==> NULL\n")); - goto jffs_lookup_end_no_biglock; - } - D3(printk (KERN_NOTICE "lookup(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - } else if ((len == 2) && (name[0] == '.') && (name[1] == '.')) { - D3(printk (KERN_NOTICE "lookup(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - if (!(inode = iget(dir->i_sb, d->pino))) { - D(printk("jffs_lookup(): .. iget() ==> NULL\n")); - goto jffs_lookup_end_no_biglock; - } - D3(printk (KERN_NOTICE "lookup(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - } else if ((f = jffs_find_child(d, name, len))) { - D3(printk (KERN_NOTICE "lookup(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - if (!(inode = iget(dir->i_sb, f->ino))) { - D(printk("jffs_lookup(): iget() ==> NULL\n")); - goto jffs_lookup_end_no_biglock; - } - D3(printk (KERN_NOTICE "lookup(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - } else { - D3(printk("jffs_lookup(): Couldn't find the file. " - "f = 0x%p, name = \"%s\", d = 0x%p, d->ino = %u\n", - f, name, d, d->ino)); - inode = NULL; - } - - d_add(dentry, inode); - D3(printk (KERN_NOTICE "lookup(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return NULL; - -jffs_lookup_end: - D3(printk (KERN_NOTICE "lookup(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - -jffs_lookup_end_no_biglock: - unlock_kernel(); - return ERR_PTR(r); -} /* jffs_lookup() */ - - -/* Try to read a page of data from a file. */ -static int -jffs_do_readpage_nolock(struct file *file, struct page *page) -{ - void *buf; - unsigned long read_len; - int result; - struct inode *inode = (struct inode*)page->mapping->host; - struct jffs_file *f = (struct jffs_file *)inode->i_private; - struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; - int r; - loff_t offset; - - D2(printk("***jffs_readpage(): file = \"%s\", page->index = %lu\n", - (f->name ? f->name : ""), (long)page->index)); - - get_page(page); - /* Don't SetPageLocked(page), should be locked already */ - ClearPageUptodate(page); - ClearPageError(page); - - D3(printk (KERN_NOTICE "readpage(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - read_len = 0; - result = 0; - offset = page_offset(page); - - kmap(page); - buf = page_address(page); - if (offset < inode->i_size) { - read_len = min_t(long, inode->i_size - offset, PAGE_SIZE); - r = jffs_read_data(f, buf, offset, read_len); - if (r != read_len) { - result = -EIO; - D( - printk("***jffs_readpage(): Read error! " - "Wanted to read %lu bytes but only " - "read %d bytes.\n", read_len, r); - ); - } - - } - - /* This handles the case of partial or no read in above */ - if(read_len < PAGE_SIZE) - memset(buf + read_len, 0, PAGE_SIZE - read_len); - flush_dcache_page(page); - kunmap(page); - - D3(printk (KERN_NOTICE "readpage(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - - if (result) { - SetPageError(page); - }else { - SetPageUptodate(page); - } - - page_cache_release(page); - - D3(printk("jffs_readpage(): Leaving...\n")); - - return result; -} /* jffs_do_readpage_nolock() */ - -static int jffs_readpage(struct file *file, struct page *page) -{ - int ret = jffs_do_readpage_nolock(file, page); - unlock_page(page); - return ret; -} - -/* Create a new directory. */ -static int -jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_node *node; - struct jffs_file *dir_f; - struct inode *inode; - int dir_mode; - int result = 0; - int err; - - D1({ - int len = dentry->d_name.len; - char *_name = kmalloc(len + 1, GFP_KERNEL); - memcpy(_name, dentry->d_name.name, len); - _name[len] = '\0'; - printk("***jffs_mkdir(): dir = 0x%p, name = \"%s\", " - "len = %d, mode = 0x%08x\n", dir, _name, len, mode); - kfree(_name); - }); - - lock_kernel(); - dir_f = dir->i_private; - - ASSERT(if (!dir_f) { - printk(KERN_ERR "jffs_mkdir(): No reference to a " - "jffs_file struct in inode.\n"); - unlock_kernel(); - return -EIO; - }); - - c = dir_f->c; - D3(printk (KERN_NOTICE "mkdir(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - dir_mode = S_IFDIR | (mode & (S_IRWXUGO|S_ISVTX) - & ~current->fs->umask); - if (dir->i_mode & S_ISGID) { - dir_mode |= S_ISGID; - } - - /* Create a node and initialize it as much as needed. */ - if (!(node = jffs_alloc_node())) { - D(printk("jffs_mkdir(): Allocation failed: node == 0\n")); - result = -ENOMEM; - goto jffs_mkdir_end; - } - node->data_offset = 0; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = c->next_ino++; - raw_inode.pino = dir_f->ino; - raw_inode.version = 1; - raw_inode.mode = dir_mode; - raw_inode.uid = current->fsuid; - raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; - /* raw_inode.gid = current->fsgid; */ - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = raw_inode.atime; - raw_inode.offset = 0; - raw_inode.dsize = 0; - raw_inode.rsize = 0; - raw_inode.nsize = dentry->d_name.len; - raw_inode.nlink = 1; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - /* Write the new node to the flash. */ - if ((result = jffs_write_node(c, node, &raw_inode, - dentry->d_name.name, NULL, 0, NULL)) < 0) { - D(printk("jffs_mkdir(): jffs_write_node() failed.\n")); - jffs_free_node(node); - goto jffs_mkdir_end; - } - - /* Insert the new node into the file system. */ - if ((result = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name, - node)) < 0) { - goto jffs_mkdir_end; - } - - inode = jffs_new_inode(dir, &raw_inode, &err); - if (inode == NULL) { - result = err; - goto jffs_mkdir_end; - } - - inode->i_op = &jffs_dir_inode_operations; - inode->i_fop = &jffs_dir_operations; - - mark_inode_dirty(dir); - d_instantiate(dentry, inode); - - result = 0; -jffs_mkdir_end: - D3(printk (KERN_NOTICE "mkdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return result; -} /* jffs_mkdir() */ - - -/* Remove a directory. */ -static int -jffs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info; - int ret; - D3(printk("***jffs_rmdir()\n")); - D3(printk (KERN_NOTICE "rmdir(): down biglock\n")); - lock_kernel(); - mutex_lock(&c->fmc->biglock); - ret = jffs_remove(dir, dentry, S_IFDIR); - D3(printk (KERN_NOTICE "rmdir(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return ret; -} - - -/* Remove any kind of file except for directories. */ -static int -jffs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info; - int ret; - - lock_kernel(); - D3(printk("***jffs_unlink()\n")); - D3(printk (KERN_NOTICE "unlink(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - ret = jffs_remove(dir, dentry, 0); - D3(printk (KERN_NOTICE "unlink(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return ret; -} - - -/* Remove a JFFS entry, i.e. plain files, directories, etc. Here we - shouldn't test for free space on the device. */ -static int -jffs_remove(struct inode *dir, struct dentry *dentry, int type) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_file *dir_f; /* The file-to-remove's parent. */ - struct jffs_file *del_f; /* The file to remove. */ - struct jffs_node *del_node; - struct inode *inode = NULL; - int result = 0; - - D1({ - int len = dentry->d_name.len; - const char *name = dentry->d_name.name; - char *_name = kmalloc(len + 1, GFP_KERNEL); - memcpy(_name, name, len); - _name[len] = '\0'; - printk("***jffs_remove(): file = \"%s\", ino = %ld\n", _name, dentry->d_inode->i_ino); - kfree(_name); - }); - - dir_f = dir->i_private; - c = dir_f->c; - - result = -ENOENT; - if (!(del_f = jffs_find_child(dir_f, dentry->d_name.name, - dentry->d_name.len))) { - D(printk("jffs_remove(): jffs_find_child() failed.\n")); - goto jffs_remove_end; - } - - if (S_ISDIR(type)) { - struct jffs_file *child = del_f->children; - while(child) { - if( !child->deleted ) { - result = -ENOTEMPTY; - goto jffs_remove_end; - } - child = child->sibling_next; - } - } - else if (S_ISDIR(del_f->mode)) { - D(printk("jffs_remove(): node is a directory " - "but it shouldn't be.\n")); - result = -EPERM; - goto jffs_remove_end; - } - - inode = dentry->d_inode; - - result = -EIO; - if (del_f->ino != inode->i_ino) - goto jffs_remove_end; - - if (!inode->i_nlink) { - printk("Deleting nonexistent file inode: %lu, nlink: %d\n", - inode->i_ino, inode->i_nlink); - inode->i_nlink=1; - } - - /* Create a node for the deletion. */ - result = -ENOMEM; - if (!(del_node = jffs_alloc_node())) { - D(printk("jffs_remove(): Allocation failed!\n")); - goto jffs_remove_end; - } - del_node->data_offset = 0; - del_node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = del_f->ino; - raw_inode.pino = del_f->pino; -/* raw_inode.version = del_f->highest_version + 1; */ - raw_inode.mode = del_f->mode; - raw_inode.uid = current->fsuid; - raw_inode.gid = current->fsgid; - raw_inode.atime = get_seconds(); - raw_inode.mtime = del_f->mtime; - raw_inode.ctime = raw_inode.atime; - raw_inode.offset = 0; - raw_inode.dsize = 0; - raw_inode.rsize = 0; - raw_inode.nsize = 0; - raw_inode.nlink = del_f->nlink; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 1; - - /* Write the new node to the flash memory. */ - if (jffs_write_node(c, del_node, &raw_inode, NULL, NULL, 1, del_f) < 0) { - jffs_free_node(del_node); - result = -EIO; - goto jffs_remove_end; - } - - /* Update the file. This operation will make the file disappear - from the in-memory file system structures. */ - jffs_insert_node(c, del_f, &raw_inode, NULL, del_node); - - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); - - d_delete(dentry); /* This also frees the inode */ - - result = 0; -jffs_remove_end: - return result; -} /* jffs_remove() */ - - -static int -jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) -{ - struct jffs_raw_inode raw_inode; - struct jffs_file *dir_f; - struct jffs_node *node = NULL; - struct jffs_control *c; - struct inode *inode; - int result = 0; - u16 data = old_encode_dev(rdev); - int err; - - D1(printk("***jffs_mknod()\n")); - - if (!old_valid_dev(rdev)) - return -EINVAL; - lock_kernel(); - dir_f = dir->i_private; - c = dir_f->c; - - D3(printk (KERN_NOTICE "mknod(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - /* Create and initialize a new node. */ - if (!(node = jffs_alloc_node())) { - D(printk("jffs_mknod(): Allocation failed!\n")); - result = -ENOMEM; - goto jffs_mknod_err; - } - node->data_offset = 0; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = c->next_ino++; - raw_inode.pino = dir_f->ino; - raw_inode.version = 1; - raw_inode.mode = mode; - raw_inode.uid = current->fsuid; - raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; - /* raw_inode.gid = current->fsgid; */ - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = raw_inode.atime; - raw_inode.offset = 0; - raw_inode.dsize = 2; - raw_inode.rsize = 0; - raw_inode.nsize = dentry->d_name.len; - raw_inode.nlink = 1; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - /* Write the new node to the flash. */ - if ((err = jffs_write_node(c, node, &raw_inode, dentry->d_name.name, - (unsigned char *)&data, 0, NULL)) < 0) { - D(printk("jffs_mknod(): jffs_write_node() failed.\n")); - result = err; - goto jffs_mknod_err; - } - - /* Insert the new node into the file system. */ - if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name, - node)) < 0) { - result = err; - goto jffs_mknod_end; - } - - inode = jffs_new_inode(dir, &raw_inode, &err); - if (inode == NULL) { - result = err; - goto jffs_mknod_end; - } - - init_special_inode(inode, mode, rdev); - - d_instantiate(dentry, inode); - - goto jffs_mknod_end; - -jffs_mknod_err: - if (node) { - jffs_free_node(node); - } - -jffs_mknod_end: - D3(printk (KERN_NOTICE "mknod(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return result; -} /* jffs_mknod() */ - - -static int -jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_file *dir_f; - struct jffs_node *node; - struct inode *inode; - - int symname_len = strlen(symname); - int err; - - lock_kernel(); - D1({ - int len = dentry->d_name.len; - char *_name = kmalloc(len + 1, GFP_KERNEL); - char *_symname = kmalloc(symname_len + 1, GFP_KERNEL); - memcpy(_name, dentry->d_name.name, len); - _name[len] = '\0'; - memcpy(_symname, symname, symname_len); - _symname[symname_len] = '\0'; - printk("***jffs_symlink(): dir = 0x%p, " - "dentry->dname.name = \"%s\", " - "symname = \"%s\"\n", dir, _name, _symname); - kfree(_name); - kfree(_symname); - }); - - dir_f = dir->i_private; - ASSERT(if (!dir_f) { - printk(KERN_ERR "jffs_symlink(): No reference to a " - "jffs_file struct in inode.\n"); - unlock_kernel(); - return -EIO; - }); - - c = dir_f->c; - - /* Create a node and initialize it as much as needed. */ - if (!(node = jffs_alloc_node())) { - D(printk("jffs_symlink(): Allocation failed: node = NULL\n")); - unlock_kernel(); - return -ENOMEM; - } - D3(printk (KERN_NOTICE "symlink(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - node->data_offset = 0; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = c->next_ino++; - raw_inode.pino = dir_f->ino; - raw_inode.version = 1; - raw_inode.mode = S_IFLNK | S_IRWXUGO; - raw_inode.uid = current->fsuid; - raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = raw_inode.atime; - raw_inode.offset = 0; - raw_inode.dsize = symname_len; - raw_inode.rsize = 0; - raw_inode.nsize = dentry->d_name.len; - raw_inode.nlink = 1; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - /* Write the new node to the flash. */ - if ((err = jffs_write_node(c, node, &raw_inode, dentry->d_name.name, - (const unsigned char *)symname, 0, NULL)) < 0) { - D(printk("jffs_symlink(): jffs_write_node() failed.\n")); - jffs_free_node(node); - goto jffs_symlink_end; - } - - /* Insert the new node into the file system. */ - if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name, - node)) < 0) { - goto jffs_symlink_end; - } - - inode = jffs_new_inode(dir, &raw_inode, &err); - if (inode == NULL) { - goto jffs_symlink_end; - } - err = 0; - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &jffs_address_operations; - - d_instantiate(dentry, inode); - jffs_symlink_end: - D3(printk (KERN_NOTICE "symlink(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return err; -} /* jffs_symlink() */ - - -/* Create an inode inside a JFFS directory (dir) and return it. - * - * By the time this is called, we already have created - * the directory cache entry for the new file, but it - * is so far negative - it has no inode. - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ -static int -jffs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_node *node; - struct jffs_file *dir_f; /* JFFS representation of the directory. */ - struct inode *inode; - int err; - - lock_kernel(); - D1({ - int len = dentry->d_name.len; - char *s = kmalloc(len + 1, GFP_KERNEL); - memcpy(s, dentry->d_name.name, len); - s[len] = '\0'; - printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s); - kfree(s); - }); - - dir_f = dir->i_private; - ASSERT(if (!dir_f) { - printk(KERN_ERR "jffs_create(): No reference to a " - "jffs_file struct in inode.\n"); - unlock_kernel(); - return -EIO; - }); - - c = dir_f->c; - - /* Create a node and initialize as much as needed. */ - if (!(node = jffs_alloc_node())) { - D(printk("jffs_create(): Allocation failed: node == 0\n")); - unlock_kernel(); - return -ENOMEM; - } - D3(printk (KERN_NOTICE "create(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - node->data_offset = 0; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = c->next_ino++; - raw_inode.pino = dir_f->ino; - raw_inode.version = 1; - raw_inode.mode = mode; - raw_inode.uid = current->fsuid; - raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = raw_inode.atime; - raw_inode.offset = 0; - raw_inode.dsize = 0; - raw_inode.rsize = 0; - raw_inode.nsize = dentry->d_name.len; - raw_inode.nlink = 1; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - /* Write the new node to the flash. */ - if ((err = jffs_write_node(c, node, &raw_inode, - dentry->d_name.name, NULL, 0, NULL)) < 0) { - D(printk("jffs_create(): jffs_write_node() failed.\n")); - jffs_free_node(node); - goto jffs_create_end; - } - - /* Insert the new node into the file system. */ - if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name, - node)) < 0) { - goto jffs_create_end; - } - - /* Initialize an inode. */ - inode = jffs_new_inode(dir, &raw_inode, &err); - if (inode == NULL) { - goto jffs_create_end; - } - err = 0; - inode->i_op = &jffs_file_inode_operations; - inode->i_fop = &jffs_file_operations; - inode->i_mapping->a_ops = &jffs_address_operations; - inode->i_mapping->nrpages = 0; - - d_instantiate(dentry, inode); - jffs_create_end: - D3(printk (KERN_NOTICE "create(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - unlock_kernel(); - return err; -} /* jffs_create() */ - - -/* Write, append or rewrite data to an existing file. */ -static ssize_t -jffs_file_write(struct file *filp, const char *buf, size_t count, - loff_t *ppos) -{ - struct jffs_raw_inode raw_inode; - struct jffs_control *c; - struct jffs_file *f; - struct jffs_node *node; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - int recoverable = 0; - size_t written = 0; - __u32 thiscount = count; - loff_t pos = *ppos; - int err; - - inode = filp->f_path.dentry->d_inode; - - D2(printk("***jffs_file_write(): inode: 0x%p (ino: %lu), " - "filp: 0x%p, buf: 0x%p, count: %d\n", - inode, inode->i_ino, filp, buf, count)); - -#if 0 - if (inode->i_sb->s_flags & MS_RDONLY) { - D(printk("jffs_file_write(): MS_RDONLY\n")); - err = -EROFS; - goto out_isem; - } -#endif - err = -EINVAL; - - if (!S_ISREG(inode->i_mode)) { - D(printk("jffs_file_write(): inode->i_mode == 0x%08x\n", - inode->i_mode)); - goto out_isem; - } - - if (!(f = inode->i_private)) { - D(printk("jffs_file_write(): inode->i_private = 0x%p\n", - inode->i_private)); - goto out_isem; - } - - c = f->c; - - /* - * This will never trigger with sane page sizes. leave it in - * anyway, since I'm thinking about how to merge larger writes - * (the current idea is to poke a thread that does the actual - * I/O and starts by doing a mutex_lock(&inode->i_mutex). then we - * would need to get the page cache pages and have a list of - * I/O requests and do write-merging here. - * -- prumpf - */ - thiscount = min(c->fmc->max_chunk_size - sizeof(struct jffs_raw_inode), count); - - D3(printk (KERN_NOTICE "file_write(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - /* Urgh. POSIX says we can do short writes if we feel like it. - * In practice, we can't. Nothing will cope. So we loop until - * we're done. - * - * <_Anarchy_> posix and reality are not interconnected on this issue - */ - while (count) { - /* Things are going to be written so we could allocate and - initialize the necessary data structures now. */ - if (!(node = jffs_alloc_node())) { - D(printk("jffs_file_write(): node == 0\n")); - err = -ENOMEM; - goto out; - } - - node->data_offset = pos; - node->removed_size = 0; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = f->ino; - raw_inode.pino = f->pino; - - raw_inode.mode = f->mode; - - raw_inode.uid = f->uid; - raw_inode.gid = f->gid; - raw_inode.atime = get_seconds(); - raw_inode.mtime = raw_inode.atime; - raw_inode.ctime = f->ctime; - raw_inode.offset = pos; - raw_inode.dsize = thiscount; - raw_inode.rsize = 0; - raw_inode.nsize = f->nsize; - raw_inode.nlink = f->nlink; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = 0; - - if (pos < f->size) { - node->removed_size = raw_inode.rsize = min(thiscount, (__u32)(f->size - pos)); - - /* If this node is going entirely over the top of old data, - we can allow it to go into the reserved space, because - we know that GC can reclaim the space later. - */ - if (pos + thiscount < f->size) { - /* If all the data we're overwriting are _real_, - not just holes, then: - recoverable = 1; - */ - } - } - - /* Write the new node to the flash. */ - /* NOTE: We would be quite happy if jffs_write_node() wrote a - smaller node than we were expecting. There's no need for it - to waste the space at the end of the flash just because it's - a little smaller than what we asked for. But that's a whole - new can of worms which I'm not going to open this week. - -- dwmw2. - */ - if ((err = jffs_write_node(c, node, &raw_inode, f->name, - (const unsigned char *)buf, - recoverable, f)) < 0) { - D(printk("jffs_file_write(): jffs_write_node() failed.\n")); - jffs_free_node(node); - goto out; - } - - written += err; - buf += err; - count -= err; - pos += err; - - /* Insert the new node into the file system. */ - if ((err = jffs_insert_node(c, f, &raw_inode, NULL, node)) < 0) { - goto out; - } - - D3(printk("jffs_file_write(): new f_pos %ld.\n", (long)pos)); - - thiscount = min(c->fmc->max_chunk_size - sizeof(struct jffs_raw_inode), count); - } - out: - D3(printk (KERN_NOTICE "file_write(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - - /* Fix things in the real inode. */ - if (pos > inode->i_size) { - inode->i_size = pos; - inode->i_blocks = (inode->i_size + 511) >> 9; - } - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - invalidate_inode_pages(inode->i_mapping); - - out_isem: - return err; -} /* jffs_file_write() */ - -static int -jffs_prepare_write(struct file *filp, struct page *page, - unsigned from, unsigned to) -{ - /* FIXME: we should detect some error conditions here */ - - /* Bugger that. We should make sure the page is uptodate */ - if (!PageUptodate(page) && (from || to < PAGE_CACHE_SIZE)) - return jffs_do_readpage_nolock(filp, page); - - return 0; -} /* jffs_prepare_write() */ - -static int -jffs_commit_write(struct file *filp, struct page *page, - unsigned from, unsigned to) -{ - void *addr = page_address(page) + from; - /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */ - loff_t pos = page_offset(page) + from; - - return jffs_file_write(filp, addr, to-from, &pos); -} /* jffs_commit_write() */ - -/* This is our ioctl() routine. */ -static int -jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) -{ - struct jffs_control *c; - int ret = 0; - - D2(printk("***jffs_ioctl(): cmd = 0x%08x, arg = 0x%08lx\n", - cmd, arg)); - - if (!(c = (struct jffs_control *)inode->i_sb->s_fs_info)) { - printk(KERN_ERR "JFFS: Bad inode in ioctl() call. " - "(cmd = 0x%08x)\n", cmd); - return -EIO; - } - D3(printk (KERN_NOTICE "ioctl(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - - switch (cmd) { - case JFFS_PRINT_HASH: - jffs_print_hash_table(c); - break; - case JFFS_PRINT_TREE: - jffs_print_tree(c->root, 0); - break; - case JFFS_GET_STATUS: - { - struct jffs_flash_status fst; - struct jffs_fmcontrol *fmc = c->fmc; - printk("Flash status -- "); - if (!access_ok(VERIFY_WRITE, - (struct jffs_flash_status __user *)arg, - sizeof(struct jffs_flash_status))) { - D(printk("jffs_ioctl(): Bad arg in " - "JFFS_GET_STATUS ioctl!\n")); - ret = -EFAULT; - break; - } - fst.size = fmc->flash_size; - fst.used = fmc->used_size; - fst.dirty = fmc->dirty_size; - fst.begin = fmc->head->offset; - fst.end = fmc->tail->offset + fmc->tail->size; - printk("size: %d, used: %d, dirty: %d, " - "begin: %d, end: %d\n", - fst.size, fst.used, fst.dirty, - fst.begin, fst.end); - if (copy_to_user((struct jffs_flash_status __user *)arg, - &fst, - sizeof(struct jffs_flash_status))) { - ret = -EFAULT; - } - } - break; - default: - ret = -ENOTTY; - } - D3(printk (KERN_NOTICE "ioctl(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - return ret; -} /* jffs_ioctl() */ - - -static const struct address_space_operations jffs_address_operations = { - .readpage = jffs_readpage, - .prepare_write = jffs_prepare_write, - .commit_write = jffs_commit_write, -}; - -static int jffs_fsync(struct file *f, struct dentry *d, int datasync) -{ - /* We currently have O_SYNC operations at all times. - Do nothing. - */ - return 0; -} - - -static const struct file_operations jffs_file_operations = -{ - .open = generic_file_open, - .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .ioctl = jffs_ioctl, - .mmap = generic_file_readonly_mmap, - .fsync = jffs_fsync, - .sendfile = generic_file_sendfile, -}; - - -static struct inode_operations jffs_file_inode_operations = -{ - .lookup = jffs_lookup, /* lookup */ - .setattr = jffs_setattr, -}; - - -static const struct file_operations jffs_dir_operations = -{ - .readdir = jffs_readdir, -}; - - -static struct inode_operations jffs_dir_inode_operations = -{ - .create = jffs_create, - .lookup = jffs_lookup, - .unlink = jffs_unlink, - .symlink = jffs_symlink, - .mkdir = jffs_mkdir, - .rmdir = jffs_rmdir, - .mknod = jffs_mknod, - .rename = jffs_rename, - .setattr = jffs_setattr, -}; - - -/* Initialize an inode for the VFS. */ -static void -jffs_read_inode(struct inode *inode) -{ - struct jffs_file *f; - struct jffs_control *c; - - D3(printk("jffs_read_inode(): inode->i_ino == %lu\n", inode->i_ino)); - - if (!inode->i_sb) { - D(printk("jffs_read_inode(): !inode->i_sb ==> " - "No super block!\n")); - return; - } - c = (struct jffs_control *)inode->i_sb->s_fs_info; - D3(printk (KERN_NOTICE "read_inode(): down biglock\n")); - mutex_lock(&c->fmc->biglock); - if (!(f = jffs_find_file(c, inode->i_ino))) { - D(printk("jffs_read_inode(): No such inode (%lu).\n", - inode->i_ino)); - D3(printk (KERN_NOTICE "read_inode(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); - return; - } - inode->i_private = f; - inode->i_mode = f->mode; - inode->i_nlink = f->nlink; - inode->i_uid = f->uid; - inode->i_gid = f->gid; - inode->i_size = f->size; - inode->i_atime.tv_sec = f->atime; - inode->i_mtime.tv_sec = f->mtime; - inode->i_ctime.tv_sec = f->ctime; - inode->i_atime.tv_nsec = - inode->i_mtime.tv_nsec = - inode->i_ctime.tv_nsec = 0; - - inode->i_blocks = (inode->i_size + 511) >> 9; - if (S_ISREG(inode->i_mode)) { - inode->i_op = &jffs_file_inode_operations; - inode->i_fop = &jffs_file_operations; - inode->i_mapping->a_ops = &jffs_address_operations; - } - else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &jffs_dir_inode_operations; - inode->i_fop = &jffs_dir_operations; - } - else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &jffs_address_operations; - } - else { - /* If the node is a device of some sort, then the number of - the device should be read from the flash memory and then - added to the inode's i_rdev member. */ - u16 val; - jffs_read_data(f, (char *)&val, 0, 2); - init_special_inode(inode, inode->i_mode, - old_decode_dev(val)); - } - - D3(printk (KERN_NOTICE "read_inode(): up biglock\n")); - mutex_unlock(&c->fmc->biglock); -} - - -static void -jffs_delete_inode(struct inode *inode) -{ - struct jffs_file *f; - struct jffs_control *c; - D3(printk("jffs_delete_inode(): inode->i_ino == %lu\n", - inode->i_ino)); - - truncate_inode_pages(&inode->i_data, 0); - lock_kernel(); - inode->i_size = 0; - inode->i_blocks = 0; - inode->i_private = NULL; - clear_inode(inode); - if (inode->i_nlink == 0) { - c = (struct jffs_control *) inode->i_sb->s_fs_info; - f = (struct jffs_file *) jffs_find_file (c, inode->i_ino); - jffs_possibly_delete_file(f); - } - - unlock_kernel(); -} - - -static void -jffs_write_super(struct super_block *sb) -{ - struct jffs_control *c = (struct jffs_control *)sb->s_fs_info; - lock_kernel(); - jffs_garbage_collect_trigger(c); - unlock_kernel(); -} - -static int jffs_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - -static struct super_operations jffs_ops = -{ - .read_inode = jffs_read_inode, - .delete_inode = jffs_delete_inode, - .put_super = jffs_put_super, - .write_super = jffs_write_super, - .statfs = jffs_statfs, - .remount_fs = jffs_remount, -}; - -static int jffs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super, - mnt); -} - -static struct file_system_type jffs_fs_type = { - .owner = THIS_MODULE, - .name = "jffs", - .get_sb = jffs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init -init_jffs_fs(void) -{ - printk(KERN_INFO "JFFS version " JFFS_VERSION_STRING - ", (C) 1999, 2000 Axis Communications AB\n"); - -#ifdef CONFIG_JFFS_PROC_FS - jffs_proc_root = proc_mkdir("jffs", proc_root_fs); - if (!jffs_proc_root) { - printk(KERN_WARNING "cannot create /proc/jffs entry\n"); - } -#endif - fm_cache = kmem_cache_create("jffs_fm", sizeof(struct jffs_fm), - 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - NULL, NULL); - if (!fm_cache) { - return -ENOMEM; - } - - node_cache = kmem_cache_create("jffs_node",sizeof(struct jffs_node), - 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - NULL, NULL); - if (!node_cache) { - kmem_cache_destroy(fm_cache); - return -ENOMEM; - } - - return register_filesystem(&jffs_fs_type); -} - -static void __exit -exit_jffs_fs(void) -{ - unregister_filesystem(&jffs_fs_type); - kmem_cache_destroy(fm_cache); - kmem_cache_destroy(node_cache); -} - -module_init(init_jffs_fs) -module_exit(exit_jffs_fs) - -MODULE_DESCRIPTION("The Journalling Flash File System"); -MODULE_AUTHOR("Axis Communications AB."); -MODULE_LICENSE("GPL"); diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c deleted file mode 100644 index 6dd18911b44..00000000000 --- a/fs/jffs/intrep.c +++ /dev/null @@ -1,3449 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 1999, 2000 Axis Communications, Inc. - * - * Created by Finn Hakansson <finn@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: intrep.c,v 1.102 2001/09/23 23:28:36 dwmw2 Exp $ - * - * Ported to Linux 2.3.x and MTD: - * Copyright (C) 2000 Alexander Larsson (alex@cendio.se), Cendio Systems AB - * - */ - -/* This file contains the code for the internal structure of the - Journaling Flash File System, JFFS. */ - -/* - * Todo list: - * - * memcpy_to_flash() and memcpy_from_flash() functions. - * - * Implementation of hard links. - * - * Organize the source code in a better way. Against the VFS we could - * have jffs_ext.c, and against the block device jffs_int.c. - * A better file-internal organization too. - * - * A better checksum algorithm. - * - * Consider endianness stuff. ntohl() etc. - * - * Are we handling the atime, mtime, ctime members of the inode right? - * - * Remove some duplicated code. Take a look at jffs_write_node() and - * jffs_rewrite_data() for instance. - * - * Implement more meaning of the nlink member in various data structures. - * nlink could be used in conjunction with hard links for instance. - * - * Better memory management. Allocate data structures in larger chunks - * if possible. - * - * If too much meta data is stored, a garbage collect should be issued. - * We have experienced problems with too much meta data with for instance - * log files. - * - * Improve the calls to jffs_ioctl(). We would like to retrieve more - * information to be able to debug (or to supervise) JFFS during run-time. - * - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/jffs.h> -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/pagemap.h> -#include <linux/mutex.h> -#include <asm/byteorder.h> -#include <linux/smp_lock.h> -#include <linux/time.h> -#include <linux/ctype.h> -#include <linux/freezer.h> - -#include "intrep.h" -#include "jffs_fm.h" - -long no_jffs_node = 0; -static long no_jffs_file = 0; -#if defined(JFFS_MEMORY_DEBUG) && JFFS_MEMORY_DEBUG -long no_jffs_control = 0; -long no_jffs_raw_inode = 0; -long no_jffs_node_ref = 0; -long no_jffs_fm = 0; -long no_jffs_fmcontrol = 0; -long no_hash = 0; -long no_name = 0; -#endif - -static int jffs_scan_flash(struct jffs_control *c); -static int jffs_update_file(struct jffs_file *f, struct jffs_node *node); -static int jffs_build_file(struct jffs_file *f); -static int jffs_free_file(struct jffs_file *f); -static int jffs_free_node_list(struct jffs_file *f); -static int jffs_garbage_collect_now(struct jffs_control *c); -static int jffs_insert_file_into_hash(struct jffs_file *f); -static int jffs_remove_redundant_nodes(struct jffs_file *f); - -/* Is there enough space on the flash? */ -static inline int JFFS_ENOUGH_SPACE(struct jffs_control *c, __u32 space) -{ - struct jffs_fmcontrol *fmc = c->fmc; - - while (1) { - if ((fmc->flash_size - (fmc->used_size + fmc->dirty_size)) - >= fmc->min_free_size + space) { - return 1; - } - if (fmc->dirty_size < fmc->sector_size) - return 0; - - if (jffs_garbage_collect_now(c)) { - D1(printk("JFFS_ENOUGH_SPACE: jffs_garbage_collect_now() failed.\n")); - return 0; - } - } -} - -#if CONFIG_JFFS_FS_VERBOSE > 0 -static __u8 -flash_read_u8(struct mtd_info *mtd, loff_t from) -{ - size_t retlen; - __u8 ret; - int res; - - res = MTD_READ(mtd, from, 1, &retlen, &ret); - if (retlen != 1) { - printk("Didn't read a byte in flash_read_u8(). Returned %d\n", res); - return 0; - } - - return ret; -} - -static void -jffs_hexdump(struct mtd_info *mtd, loff_t pos, int size) -{ - char line[16]; - int j = 0; - - while (size > 0) { - int i; - - printk("%ld:", (long) pos); - for (j = 0; j < 16; j++) { - line[j] = flash_read_u8(mtd, pos++); - } - for (i = 0; i < j; i++) { - if (!(i & 1)) { - printk(" %.2x", line[i] & 0xff); - } - else { - printk("%.2x", line[i] & 0xff); - } - } - - /* Print empty space */ - for (; i < 16; i++) { - if (!(i & 1)) { - printk(" "); - } - else { - printk(" "); - } - } - printk(" "); - - for (i = 0; i < j; i++) { - if (isgraph(line[i])) { - printk("%c", line[i]); - } - else { - printk("."); - } - } - printk("\n"); - size -= 16; - } -} - -/* Print the contents of a node. */ -static void -jffs_print_node(struct jffs_node *n) -{ - D(printk("jffs_node: 0x%p\n", n)); - D(printk("{\n")); - D(printk(" 0x%08x, /* version */\n", n->version)); - D(printk(" 0x%08x, /* data_offset */\n", n->data_offset)); - D(printk(" 0x%08x, /* data_size */\n", n->data_size)); - D(printk(" 0x%08x, /* removed_size */\n", n->removed_size)); - D(printk(" 0x%08x, /* fm_offset */\n", n->fm_offset)); - D(printk(" 0x%02x, /* name_size */\n", n->name_size)); - D(printk(" 0x%p, /* fm, fm->offset: %u */\n", - n->fm, (n->fm ? n->fm->offset : 0))); - D(printk(" 0x%p, /* version_prev */\n", n->version_prev)); - D(printk(" 0x%p, /* version_next */\n", n->version_next)); - D(printk(" 0x%p, /* range_prev */\n", n->range_prev)); - D(printk(" 0x%p, /* range_next */\n", n->range_next)); - D(printk("}\n")); -} - -#endif - -/* Print the contents of a raw inode. */ -static void -jffs_print_raw_inode(struct jffs_raw_inode *raw_inode) -{ - D(printk("jffs_raw_inode: inode number: %u\n", raw_inode->ino)); - D(printk("{\n")); - D(printk(" 0x%08x, /* magic */\n", raw_inode->magic)); - D(printk(" 0x%08x, /* ino */\n", raw_inode->ino)); - D(printk(" 0x%08x, /* pino */\n", raw_inode->pino)); - D(printk(" 0x%08x, /* version */\n", raw_inode->version)); - D(printk(" 0x%08x, /* mode */\n", raw_inode->mode)); - D(printk(" 0x%04x, /* uid */\n", raw_inode->uid)); - D(printk(" 0x%04x, /* gid */\n", raw_inode->gid)); - D(printk(" 0x%08x, /* atime */\n", raw_inode->atime)); - D(printk(" 0x%08x, /* mtime */\n", raw_inode->mtime)); - D(printk(" 0x%08x, /* ctime */\n", raw_inode->ctime)); - D(printk(" 0x%08x, /* offset */\n", raw_inode->offset)); - D(printk(" 0x%08x, /* dsize */\n", raw_inode->dsize)); - D(printk(" 0x%08x, /* rsize */\n", raw_inode->rsize)); - D(printk(" 0x%02x, /* nsize */\n", raw_inode->nsize)); - D(printk(" 0x%02x, /* nlink */\n", raw_inode->nlink)); - D(printk(" 0x%02x, /* spare */\n", - raw_inode->spare)); - D(printk(" %u, /* rename */\n", - raw_inode->rename)); - D(printk(" %u, /* deleted */\n", - raw_inode->deleted)); - D(printk(" 0x%02x, /* accurate */\n", - raw_inode->accurate)); - D(printk(" 0x%08x, /* dchksum */\n", raw_inode->dchksum)); - D(printk(" 0x%04x, /* nchksum */\n", raw_inode->nchksum)); - D(printk(" 0x%04x, /* chksum */\n", raw_inode->chksum)); - D(printk("}\n")); -} - -#define flash_safe_acquire(arg) -#define flash_safe_release(arg) - - -static int -flash_safe_read(struct mtd_info *mtd, loff_t from, - u_char *buf, size_t count) -{ - size_t retlen; - int res; - - D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n", - mtd, (unsigned int) from, buf, count)); - - res = mtd->read(mtd, from, count, &retlen, buf); - if (retlen != count) { - panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res); - } - return res?res:retlen; -} - - -static __u32 -flash_read_u32(struct mtd_info *mtd, loff_t from) -{ - size_t retlen; - __u32 ret; - int res; - - res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret); - if (retlen != 4) { - printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res); - return 0; - } - - return ret; -} - - -static int -flash_safe_write(struct mtd_info *mtd, loff_t to, - const u_char *buf, size_t count) -{ - size_t retlen; - int res; - - D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n", - mtd, (unsigned int) to, buf, count)); - - res = mtd->write(mtd, to, count, &retlen, buf); - if (retlen != count) { - printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res); - } - return res?res:retlen; -} - - -static int -flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long iovec_cnt, loff_t to) -{ - size_t retlen, retlen_a; - int i; - int res; - - D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n", - mtd, (unsigned int) to, vecs)); - - if (mtd->writev) { - res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen); - return res ? res : retlen; - } - /* Not implemented writev. Repeatedly use write - on the not so - unreasonable assumption that the mtd driver doesn't care how - many write cycles we use. */ - res=0; - retlen=0; - - for (i=0; !res && i<iovec_cnt; i++) { - res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a, - vecs[i].iov_base); - if (retlen_a != vecs[i].iov_len) { - printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res); - if (i != iovec_cnt-1) - return -EIO; - } - /* If res is non-zero, retlen_a is undefined, but we don't - care because in that case it's not going to be - returned anyway. - */ - to += retlen_a; - retlen += retlen_a; - } - return res?res:retlen; -} - - -static int -flash_memset(struct mtd_info *mtd, loff_t to, - const u_char c, size_t size) -{ - static unsigned char pattern[64]; - int i; - - /* fill up pattern */ - - for(i = 0; i < 64; i++) - pattern[i] = c; - - /* write as many 64-byte chunks as we can */ - - while (size >= 64) { - flash_safe_write(mtd, to, pattern, 64); - size -= 64; - to += 64; - } - - /* and the rest */ - - if(size) - flash_safe_write(mtd, to, pattern, size); - - return size; -} - - -static void -intrep_erase_callback(struct erase_info *done) -{ - wait_queue_head_t *wait_q; - - wait_q = (wait_queue_head_t *)done->priv; - - wake_up(wait_q); -} - - -static int -flash_erase_region(struct mtd_info *mtd, loff_t start, - size_t size) -{ - struct erase_info *erase; - DECLARE_WAITQUEUE(wait, current); - wait_queue_head_t wait_q; - - erase = kmalloc(sizeof(struct erase_info), GFP_KERNEL); - if (!erase) - return -ENOMEM; - - init_waitqueue_head(&wait_q); - - erase->mtd = mtd; - erase->callback = intrep_erase_callback; - erase->addr = start; - erase->len = size; - erase->priv = (u_long)&wait_q; - - /* FIXME: Use TASK_INTERRUPTIBLE and deal with being interrupted */ - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&wait_q, &wait); - - if (mtd->erase(mtd, erase) < 0) { - set_current_state(TASK_RUNNING); - remove_wait_queue(&wait_q, &wait); - kfree(erase); - - printk(KERN_WARNING "flash: erase of region [0x%lx, 0x%lx] " - "totally failed\n", (long)start, (long)start + size); - - return -1; - } - - schedule(); /* Wait for flash to finish. */ - remove_wait_queue(&wait_q, &wait); - - kfree(erase); - - return 0; -} - -/* This routine calculates checksums in JFFS. */ -static __u32 -jffs_checksum(const void *data, int size) -{ - __u32 sum = 0; - __u8 *ptr = (__u8 *)data; - while (size-- > 0) { - sum += *ptr++; - } - D3(printk(", result: 0x%08x\n", sum)); - return sum; -} - - -static int -jffs_checksum_flash(struct mtd_info *mtd, loff_t start, int size, __u32 *result) -{ - __u32 sum = 0; - loff_t ptr = start; - __u8 *read_buf; - int i, length; - - /* Allocate read buffer */ - read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL); - if (!read_buf) { - printk(KERN_NOTICE "kmalloc failed in jffs_checksum_flash()\n"); - return -ENOMEM; - } - /* Loop until checksum done */ - while (size) { - /* Get amount of data to read */ - if (size < 4096) - length = size; - else - length = 4096; - - /* Perform flash read */ - D3(printk(KERN_NOTICE "jffs_checksum_flash\n")); - flash_safe_read(mtd, ptr, &read_buf[0], length); - - /* Compute checksum */ - for (i=0; i < length ; i++) - sum += read_buf[i]; - - /* Update pointer and size */ - size -= length; - ptr += length; - } - - /* Free read buffer */ - kfree(read_buf); - - /* Return result */ - D3(printk("checksum result: 0x%08x\n", sum)); - *result = sum; - return 0; -} - -static __inline__ void jffs_fm_write_lock(struct jffs_fmcontrol *fmc) -{ - // down(&fmc->wlock); -} - -static __inline__ void jffs_fm_write_unlock(struct jffs_fmcontrol *fmc) -{ - // up(&fmc->wlock); -} - - -/* Create and initialize a new struct jffs_file. */ -static struct jffs_file * -jffs_create_file(struct jffs_control *c, - const struct jffs_raw_inode *raw_inode) -{ - struct jffs_file *f; - - if (!(f = kzalloc(sizeof(*f), GFP_KERNEL))) { - D(printk("jffs_create_file(): Failed!\n")); - return NULL; - } - no_jffs_file++; - f->ino = raw_inode->ino; - f->pino = raw_inode->pino; - f->nlink = raw_inode->nlink; - f->deleted = raw_inode->deleted; - f->c = c; - - return f; -} - - -/* Build a control block for the file system. */ -static struct jffs_control * -jffs_create_control(struct super_block *sb) -{ - struct jffs_control *c; - register int s = sizeof(struct jffs_control); - int i; - D(char *t = 0); - - D2(printk("jffs_create_control()\n")); - - if (!(c = kmalloc(s, GFP_KERNEL))) { - goto fail_control; - } - DJM(no_jffs_control++); - c->root = NULL; - c->gc_task = NULL; - c->hash_len = JFFS_HASH_SIZE; - s = sizeof(struct list_head) * c->hash_len; - if (!(c->hash = kmalloc(s, GFP_KERNEL))) { - goto fail_hash; - } - DJM(no_hash++); - for (i = 0; i < c->hash_len; i++) - INIT_LIST_HEAD(&c->hash[i]); - if (!(c->fmc = jffs_build_begin(c, MINOR(sb->s_dev)))) { - goto fail_fminit; - } - c->next_ino = JFFS_MIN_INO + 1; - c->delete_list = (struct jffs_delete_list *) 0; - return c; - -fail_fminit: - D(t = "c->fmc"); -fail_hash: - kfree(c); - DJM(no_jffs_control--); - D(t = t ? t : "c->hash"); -fail_control: - D(t = t ? t : "control"); - D(printk("jffs_create_control(): Allocation failed: (%s)\n", t)); - return (struct jffs_control *)0; -} - - -/* Clean up all data structures associated with the file system. */ -void -jffs_cleanup_control(struct jffs_control *c) -{ - D2(printk("jffs_cleanup_control()\n")); - - if (!c) { - D(printk("jffs_cleanup_control(): c == NULL !!!\n")); - return; - } - - while (c->delete_list) { - struct jffs_delete_list *delete_list_element; - delete_list_element = c->delete_list; - c->delete_list = c->delete_list->next; - kfree(delete_list_element); - } - - /* Free all files and nodes. */ - if (c->hash) { - jffs_foreach_file(c, jffs_free_node_list); - jffs_foreach_file(c, jffs_free_file); - kfree(c->hash); - DJM(no_hash--); - } - jffs_cleanup_fmcontrol(c->fmc); - kfree(c); - DJM(no_jffs_control--); - D3(printk("jffs_cleanup_control(): Leaving...\n")); -} - - -/* This function adds a virtual root node to the in-RAM representation. - Called by jffs_build_fs(). */ -static int -jffs_add_virtual_root(struct jffs_control *c) -{ - struct jffs_file *root; - struct jffs_node *node; - - D2(printk("jffs_add_virtual_root(): " - "Creating a virtual root directory.\n")); - - if (!(root = kzalloc(sizeof(struct jffs_file), GFP_KERNEL))) { - return -ENOMEM; - } - no_jffs_file++; - if (!(node = jffs_alloc_node())) { - kfree(root); - no_jffs_file--; - return -ENOMEM; - } - DJM(no_jffs_node++); - memset(node, 0, sizeof(struct jffs_node)); - node->ino = JFFS_MIN_INO; - root->ino = JFFS_MIN_INO; - root->mode = S_IFDIR | S_IRWXU | S_IRGRP - | S_IXGRP | S_IROTH | S_IXOTH; - root->atime = root->mtime = root->ctime = get_seconds(); - root->nlink = 1; - root->c = c; - root->version_head = root->version_tail = node; - jffs_insert_file_into_hash(root); - return 0; -} - - -/* This is where the file system is built and initialized. */ -int -jffs_build_fs(struct super_block *sb) -{ - struct jffs_control *c; - int err = 0; - - D2(printk("jffs_build_fs()\n")); - - if (!(c = jffs_create_control(sb))) { - return -ENOMEM; - } - c->building_fs = 1; - c->sb = sb; - if ((err = jffs_scan_flash(c)) < 0) { - if(err == -EAGAIN){ - /* scan_flash() wants us to try once more. A flipping - bits sector was detect in the middle of the scan flash. - Clean up old allocated memory before going in. - */ - D1(printk("jffs_build_fs: Cleaning up all control structures," - " reallocating them and trying mount again.\n")); - jffs_cleanup_control(c); - if (!(c = jffs_create_control(sb))) { - return -ENOMEM; - } - c->building_fs = 1; - c->sb = sb; - - if ((err = jffs_scan_flash(c)) < 0) { - goto jffs_build_fs_fail; - } - }else{ - goto jffs_build_fs_fail; - } - } - - /* Add a virtual root node if no one exists. */ - if (!jffs_find_file(c, JFFS_MIN_INO)) { - if ((err = jffs_add_virtual_root(c)) < 0) { - goto jffs_build_fs_fail; - } - } - - while (c->delete_list) { - struct jffs_file *f; - struct jffs_delete_list *delete_list_element; - - if ((f = jffs_find_file(c, c->delete_list->ino))) { - f->deleted = 1; - } - delete_list_element = c->delete_list; - c->delete_list = c->delete_list->next; - kfree(delete_list_element); - } - - /* Remove deleted nodes. */ - if ((err = jffs_foreach_file(c, jffs_possibly_delete_file)) < 0) { - printk(KERN_ERR "JFFS: Failed to remove deleted nodes.\n"); - goto jffs_build_fs_fail; - } - /* Remove redundant nodes. (We are not interested in the - return value in this case.) */ - jffs_foreach_file(c, jffs_remove_redundant_nodes); - /* Try to build a tree from all the nodes. */ - if ((err = jffs_foreach_file(c, jffs_insert_file_into_tree)) < 0) { - printk("JFFS: Failed to build tree.\n"); - goto jffs_build_fs_fail; - } - /* Compute the sizes of all files in the filesystem. Adjust if - necessary. */ - if ((err = jffs_foreach_file(c, jffs_build_file)) < 0) { - printk("JFFS: Failed to build file system.\n"); - goto jffs_build_fs_fail; - } - sb->s_fs_info = (void *)c; - c->building_fs = 0; - - D1(jffs_print_hash_table(c)); - D1(jffs_print_tree(c->root, 0)); - - return 0; - -jffs_build_fs_fail: - jffs_cleanup_control(c); - return err; -} /* jffs_build_fs() */ - - -/* - This checks for sectors that were being erased in their previous - lifetimes and for some reason or the other (power fail etc.), - the erase cycles never completed. - As the flash array would have reverted back to read status, - these sectors are detected by the symptom of the "flipping bits", - i.e. bits being read back differently from the same location in - flash if read multiple times. - The only solution to this is to re-erase the entire - sector. - Unfortunately detecting "flipping bits" is not a simple exercise - as a bit may be read back at 1 or 0 depending on the alignment - of the stars in the universe. - The level of confidence is in direct proportion to the number of - scans done. By power fail testing I (Vipin) have been able to - proove that reading twice is not enough. - Maybe 4 times? Change NUM_REREADS to a higher number if you want - a (even) higher degree of confidence in your mount process. - A higher number would of course slow down your mount. -*/ -static int check_partly_erased_sectors(struct jffs_fmcontrol *fmc){ - -#define NUM_REREADS 4 /* see note above */ -#define READ_AHEAD_BYTES 4096 /* must be a multiple of 4, - usually set to kernel page size */ - - __u8 *read_buf1; - __u8 *read_buf2; - - int err = 0; - int retlen; - int i; - int cnt; - __u32 offset; - loff_t pos = 0; - loff_t end = fmc->flash_size; - - - /* Allocate read buffers */ - read_buf1 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); - if (!read_buf1) - return -ENOMEM; - - read_buf2 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL); - if (!read_buf2) { - kfree(read_buf1); - return -ENOMEM; - } - - CHECK_NEXT: - while(pos < end){ - - D1(printk("check_partly_erased_sector():checking sector which contains" - " offset 0x%x for flipping bits..\n", (__u32)pos)); - - retlen = flash_safe_read(fmc->mtd, pos, - &read_buf1[0], READ_AHEAD_BYTES); - retlen &= ~3; - - for(cnt = 0; cnt < NUM_REREADS; cnt++){ - (void)flash_safe_read(fmc->mtd, pos, - &read_buf2[0], READ_AHEAD_BYTES); - - for (i=0 ; i < retlen ; i+=4) { - /* buffers MUST match, double word for word! */ - if(*((__u32 *) &read_buf1[i]) != - *((__u32 *) &read_buf2[i]) - ){ - /* flipping bits detected, time to erase sector */ - /* This will help us log some statistics etc. */ - D1(printk("Flipping bits detected in re-read round:%i of %i\n", - cnt, NUM_REREADS)); - D1(printk("check_partly_erased_sectors:flipping bits detected" - " @offset:0x%x(0x%x!=0x%x)\n", - (__u32)pos+i, *((__u32 *) &read_buf1[i]), - *((__u32 *) &read_buf2[i]))); - - /* calculate start of present sector */ - offset = (((__u32)pos+i)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size; - - D1(printk("check_partly_erased_sector():erasing sector starting 0x%x.\n", - offset)); - - if (flash_erase_region(fmc->mtd, - offset, fmc->sector_size) < 0) { - printk(KERN_ERR "JFFS: Erase of flash failed. " - "offset = %u, erase_size = %d\n", - offset , fmc->sector_size); - - err = -EIO; - goto returnBack; - - }else{ - D1(printk("JFFS: Erase of flash sector @0x%x successful.\n", - offset)); - /* skip ahead to the next sector */ - pos = (((__u32)pos+i)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size; - pos += fmc->sector_size; - goto CHECK_NEXT; - } - } - } - } - pos += READ_AHEAD_BYTES; - } - - returnBack: - kfree(read_buf1); - kfree(read_buf2); - - D2(printk("check_partly_erased_sector():Done checking all sectors till offset 0x%x for flipping bits.\n", - (__u32)pos)); - - return err; - -}/* end check_partly_erased_sectors() */ - - - -/* Scan the whole flash memory in order to find all nodes in the - file systems. */ -static int -jffs_scan_flash(struct jffs_control *c) -{ - char name[JFFS_MAX_NAME_LEN + 2]; - struct jffs_raw_inode raw_inode; - struct jffs_node *node = NULL; - struct jffs_fmcontrol *fmc = c->fmc; - __u32 checksum; - __u8 tmp_accurate; - __u16 tmp_chksum; - __u32 deleted_file; - loff_t pos = 0; - loff_t start; - loff_t test_start; - loff_t end = fmc->flash_size; - __u8 *read_buf; - int i, len, retlen; - __u32 offset; - - __u32 free_chunk_size1; - __u32 free_chunk_size2; - - -#define NUMFREEALLOWED 2 /* 2 chunks of at least erase size space allowed */ - int num_free_space = 0; /* Flag err if more than TWO - free blocks found. This is NOT allowed - by the current jffs design. - */ - int num_free_spc_not_accp = 0; /* For debugging purposed keep count - of how much free space was rejected and - marked dirty - */ - - D1(printk("jffs_scan_flash(): start pos = 0x%lx, end = 0x%lx\n", - (long)pos, (long)end)); - - flash_safe_acquire(fmc->mtd); - - /* - check and make sure that any sector does not suffer - from the "partly erased, bit flipping syndrome" (TM Vipin :) - If so, offending sectors will be erased. - */ - if(check_partly_erased_sectors(fmc) < 0){ - - flash_safe_release(fmc->mtd); - return -EIO; /* bad, bad, bad error. Cannot continue.*/ - } - - /* Allocate read buffer */ - read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL); - if (!read_buf) { - flash_safe_release(fmc->mtd); - return -ENOMEM; - } - - /* Start the scan. */ - while (pos < end) { - deleted_file = 0; - - /* Remember the position from where we started this scan. */ - start = pos; - - switch (flash_read_u32(fmc->mtd, pos)) { - case JFFS_EMPTY_BITMASK: - /* We have found 0xffffffff at this position. We have to - scan the rest of the flash till the end or till - something else than 0xffffffff is found. - Keep going till we do not find JFFS_EMPTY_BITMASK - anymore */ - - D1(printk("jffs_scan_flash(): 0xffffffff at pos 0x%lx.\n", - (long)pos)); - - while(pos < end){ - - len = end - pos < 4096 ? end - pos : 4096; - - retlen = flash_safe_read(fmc->mtd, pos, - &read_buf[0], len); - - retlen &= ~3; - - for (i=0 ; i < retlen ; i+=4, pos += 4) { - if(*((__u32 *) &read_buf[i]) != - JFFS_EMPTY_BITMASK) - break; - } - if (i == retlen) - continue; - else - break; - } - - D1(printk("jffs_scan_flash():0xffffffff ended at pos 0x%lx.\n", - (long)pos)); - - /* If some free space ends in the middle of a sector, - treat it as dirty rather than clean. - This is to handle the case where one thread - allocated space for a node, but didn't get to - actually _write_ it before power was lost, leaving - a gap in the log. Shifting all node writes into - a single kernel thread will fix the original problem. - */ - if ((__u32) pos % fmc->sector_size) { - /* If there was free space in previous - sectors, don't mark that dirty too - - only from the beginning of this sector - (or from start) - */ - - test_start = pos & ~(fmc->sector_size-1); /* end of last sector */ - - if (start < test_start) { - - /* free space started in the previous sector! */ - - if((num_free_space < NUMFREEALLOWED) && - ((unsigned int)(test_start - start) >= fmc->sector_size)){ - - /* - Count it in if we are still under NUMFREEALLOWED *and* it is - at least 1 erase sector in length. This will keep us from - picking any little ole' space as "free". - */ - - D1(printk("Reducing end of free space to 0x%x from 0x%x\n", - (unsigned int)test_start, (unsigned int)pos)); - - D1(printk("Free space accepted: Starting 0x%x for 0x%x bytes\n", - (unsigned int) start, - (unsigned int)(test_start - start))); - - /* below, space from "start" to "pos" will be marked dirty. */ - start = test_start; - - /* Being in here means that we have found at least an entire - erase sector size of free space ending on a sector boundary. - Keep track of free spaces accepted. - */ - num_free_space++; - }else{ - num_free_spc_not_accp++; - D1(printk("Free space (#%i) found but *Not* accepted: Starting" - " 0x%x for 0x%x bytes\n", - num_free_spc_not_accp, (unsigned int)start, - (unsigned int)((unsigned int)(pos & ~(fmc->sector_size-1)) - (unsigned int)start))); - - } - - } - if((((__u32)(pos - start)) != 0)){ - - D1(printk("Dirty space: Starting 0x%x for 0x%x bytes\n", - (unsigned int) start, (unsigned int) (pos - start))); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - }else{ - /* "Flipping bits" detected. This means that our scan for them - did not catch this offset. See check_partly_erased_sectors() for - more info. - */ - - D1(printk("jffs_scan_flash():wants to allocate dirty flash " - "space for 0 bytes.\n")); - D1(printk("jffs_scan_flash(): Flipping bits! We will free " - "all allocated memory, erase this sector and remount\n")); - - /* calculate start of present sector */ - offset = (((__u32)pos)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size; - - D1(printk("jffs_scan_flash():erasing sector starting 0x%x.\n", - offset)); - - if (flash_erase_region(fmc->mtd, - offset, fmc->sector_size) < 0) { - printk(KERN_ERR "JFFS: Erase of flash failed. " - "offset = %u, erase_size = %d\n", - offset , fmc->sector_size); - - flash_safe_release(fmc->mtd); - kfree(read_buf); - return -1; /* bad, bad, bad! */ - - } - flash_safe_release(fmc->mtd); - kfree(read_buf); - - return -EAGAIN; /* erased offending sector. Try mount one more time please. */ - } - }else{ - /* Being in here means that we have found free space that ends on an erase sector - boundary. - Count it in if we are still under NUMFREEALLOWED *and* it is at least 1 erase - sector in length. This will keep us from picking any little ole' space as "free". - */ - if((num_free_space < NUMFREEALLOWED) && - ((unsigned int)(pos - start) >= fmc->sector_size)){ - /* We really don't do anything to mark space as free, except *not* - mark it dirty and just advance the "pos" location pointer. - It will automatically be picked up as free space. - */ - num_free_space++; - D1(printk("Free space accepted: Starting 0x%x for 0x%x bytes\n", - (unsigned int) start, (unsigned int) (pos - start))); - }else{ - num_free_spc_not_accp++; - D1(printk("Free space (#%i) found but *Not* accepted: Starting " - "0x%x for 0x%x bytes\n", num_free_spc_not_accp, - (unsigned int) start, - (unsigned int) (pos - start))); - - /* Mark this space as dirty. We already have our free space. */ - D1(printk("Dirty space: Starting 0x%x for 0x%x bytes\n", - (unsigned int) start, (unsigned int) (pos - start))); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - } - - } - if(num_free_space > NUMFREEALLOWED){ - printk(KERN_WARNING "jffs_scan_flash(): Found free space " - "number %i. Only %i free space is allowed.\n", - num_free_space, NUMFREEALLOWED); - } - continue; - - case JFFS_DIRTY_BITMASK: - /* We have found 0x00000000 at this position. Scan as far - as possible to find out how much is dirty. */ - D1(printk("jffs_scan_flash(): 0x00000000 at pos 0x%lx.\n", - (long)pos)); - for (; pos < end - && JFFS_DIRTY_BITMASK == flash_read_u32(fmc->mtd, pos); - pos += 4); - D1(printk("jffs_scan_flash(): 0x00 ended at " - "pos 0x%lx.\n", (long)pos)); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - continue; - - case JFFS_MAGIC_BITMASK: - /* We have probably found a new raw inode. */ - break; - - default: - bad_inode: - /* We're f*cked. This is not solved yet. We have - to scan for the magic pattern. */ - D1(printk("*************** Dirty flash memory or " - "bad inode: " - "hexdump(pos = 0x%lx, len = 128):\n", - (long)pos)); - D1(jffs_hexdump(fmc->mtd, pos, 128)); - - for (pos += 4; pos < end; pos += 4) { - switch (flash_read_u32(fmc->mtd, pos)) { - case JFFS_MAGIC_BITMASK: - case JFFS_EMPTY_BITMASK: - /* handle these in the main switch() loop */ - goto cont_scan; - - default: - break; - } - } - - cont_scan: - /* First, mark as dirty the region - which really does contain crap. */ - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), - NULL); - - continue; - }/* switch */ - - /* We have found the beginning of an inode. Create a - node for it unless there already is one available. */ - if (!node) { - if (!(node = jffs_alloc_node())) { - /* Free read buffer */ - kfree(read_buf); - - /* Release the flash device */ - flash_safe_release(fmc->mtd); - - return -ENOMEM; - } - DJM(no_jffs_node++); - } - - /* Read the next raw inode. */ - - flash_safe_read(fmc->mtd, pos, (u_char *) &raw_inode, - sizeof(struct jffs_raw_inode)); - - /* When we compute the checksum for the inode, we never - count the 'accurate' or the 'checksum' fields. */ - tmp_accurate = raw_inode.accurate; - tmp_chksum = raw_inode.chksum; - raw_inode.accurate = 0; - raw_inode.chksum = 0; - checksum = jffs_checksum(&raw_inode, - sizeof(struct jffs_raw_inode)); - raw_inode.accurate = tmp_accurate; - raw_inode.chksum = tmp_chksum; - - D3(printk("*** We have found this raw inode at pos 0x%lx " - "on the flash:\n", (long)pos)); - D3(jffs_print_raw_inode(&raw_inode)); - - if (checksum != raw_inode.chksum) { - D1(printk("jffs_scan_flash(): Bad checksum: " - "checksum = %u, " - "raw_inode.chksum = %u\n", - checksum, raw_inode.chksum)); - pos += sizeof(struct jffs_raw_inode); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - /* Reuse this unused struct jffs_node. */ - continue; - } - - /* Check the raw inode read so far. Start with the - maximum length of the filename. */ - if (raw_inode.nsize > JFFS_MAX_NAME_LEN) { - printk(KERN_WARNING "jffs_scan_flash: Found a " - "JFFS node with name too large\n"); - goto bad_inode; - } - - if (raw_inode.rename && raw_inode.dsize != sizeof(__u32)) { - printk(KERN_WARNING "jffs_scan_flash: Found a " - "rename node with dsize %u.\n", - raw_inode.dsize); - jffs_print_raw_inode(&raw_inode); - goto bad_inode; - } - - /* The node's data segment should not exceed a - certain length. */ - if (raw_inode.dsize > fmc->max_chunk_size) { - printk(KERN_WARNING "jffs_scan_flash: Found a " - "JFFS node with dsize (0x%x) > max_chunk_size (0x%x)\n", - raw_inode.dsize, fmc->max_chunk_size); - goto bad_inode; - } - - pos += sizeof(struct jffs_raw_inode); - - /* This shouldn't be necessary because a node that - violates the flash boundaries shouldn't be written - in the first place. */ - if (pos >= end) { - goto check_node; - } - - /* Read the name. */ - *name = 0; - if (raw_inode.nsize) { - flash_safe_read(fmc->mtd, pos, name, raw_inode.nsize); - name[raw_inode.nsize] = '\0'; - pos += raw_inode.nsize - + JFFS_GET_PAD_BYTES(raw_inode.nsize); - D3(printk("name == \"%s\"\n", name)); - checksum = jffs_checksum(name, raw_inode.nsize); - if (checksum != raw_inode.nchksum) { - D1(printk("jffs_scan_flash(): Bad checksum: " - "checksum = %u, " - "raw_inode.nchksum = %u\n", - checksum, raw_inode.nchksum)); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - /* Reuse this unused struct jffs_node. */ - continue; - } - if (pos >= end) { - goto check_node; - } - } - - /* Read the data, if it exists, in order to be sure it - matches the checksum. */ - if (raw_inode.dsize) { - if (raw_inode.rename) { - deleted_file = flash_read_u32(fmc->mtd, pos); - } - if (jffs_checksum_flash(fmc->mtd, pos, raw_inode.dsize, &checksum)) { - printk("jffs_checksum_flash() failed to calculate a checksum\n"); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - /* Reuse this unused struct jffs_node. */ - continue; - } - pos += raw_inode.dsize - + JFFS_GET_PAD_BYTES(raw_inode.dsize); - - if (checksum != raw_inode.dchksum) { - D1(printk("jffs_scan_flash(): Bad checksum: " - "checksum = %u, " - "raw_inode.dchksum = %u\n", - checksum, raw_inode.dchksum)); - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - /* Reuse this unused struct jffs_node. */ - continue; - } - } - - check_node: - - /* Remember the highest inode number in the whole file - system. This information will be used when assigning - new files new inode numbers. */ - if (c->next_ino <= raw_inode.ino) { - c->next_ino = raw_inode.ino + 1; - } - - if (raw_inode.accurate) { - int err; - node->data_offset = raw_inode.offset; - node->data_size = raw_inode.dsize; - node->removed_size = raw_inode.rsize; - /* Compute the offset to the actual data in the - on-flash node. */ - node->fm_offset - = sizeof(struct jffs_raw_inode) - + raw_inode.nsize - + JFFS_GET_PAD_BYTES(raw_inode.nsize); - node->fm = jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), - node); - if (!node->fm) { - D(printk("jffs_scan_flash(): !node->fm\n")); - jffs_free_node(node); - DJM(no_jffs_node--); - - /* Free read buffer */ - kfree(read_buf); - - /* Release the flash device */ - flash_safe_release(fmc->mtd); - - return -ENOMEM; - } - if ((err = jffs_insert_node(c, NULL, &raw_inode, - name, node)) < 0) { - printk("JFFS: Failed to handle raw inode. " - "(err = %d)\n", err); - break; - } - if (raw_inode.rename) { - struct jffs_delete_list *dl - = (struct jffs_delete_list *) - kmalloc(sizeof(struct jffs_delete_list), - GFP_KERNEL); - if (!dl) { - D(printk("jffs_scan_flash: !dl\n")); - jffs_free_node(node); - DJM(no_jffs_node--); - - /* Release the flash device */ - flash_safe_release(fmc->flash_part); - - /* Free read buffer */ - kfree(read_buf); - - return -ENOMEM; - } - dl->ino = deleted_file; - dl->next = c->delete_list; - c->delete_list = dl; - node->data_size = 0; - } - D3(jffs_print_node(node)); - node = NULL; /* Don't free the node! */ - } - else { - jffs_fmalloced(fmc, (__u32) start, - (__u32) (pos - start), NULL); - D3(printk("jffs_scan_flash(): Just found an obsolete " - "raw_inode. Continuing the scan...\n")); - /* Reuse this unused struct jffs_node. */ - } - } - - if (node) { - jffs_free_node(node); - DJM(no_jffs_node--); - } - jffs_build_end(fmc); - - /* Free read buffer */ - kfree(read_buf); - - if(!num_free_space){ - printk(KERN_WARNING "jffs_scan_flash(): Did not find even a single " - "chunk of free space. This is BAD!\n"); - } - - /* Return happy */ - D3(printk("jffs_scan_flash(): Leaving...\n")); - flash_safe_release(fmc->mtd); - - /* This is to trap the "free size accounting screwed error. */ - free_chunk_size1 = jffs_free_size1(fmc); - free_chunk_size2 = jffs_free_size2(fmc); - - if (free_chunk_size1 + free_chunk_size2 != fmc->free_size) { - - printk(KERN_WARNING "jffs_scan_falsh():Free size accounting screwed\n"); - printk(KERN_WARNING "jfffs_scan_flash():free_chunk_size1 == 0x%x, " - "free_chunk_size2 == 0x%x, fmc->free_size == 0x%x\n", - free_chunk_size1, free_chunk_size2, fmc->free_size); - - return -1; /* Do NOT mount f/s so that we can inspect what happened. - Mounting this screwed up f/s will screw us up anyway. - */ - } - - return 0; /* as far as we are concerned, we are happy! */ -} /* jffs_scan_flash() */ - - -/* Insert any kind of node into the file system. Take care of data - insertions and deletions. Also remove redundant information. The - memory allocated for the `name' is regarded as "given away" in the - caller's perspective. */ -int -jffs_insert_node(struct jffs_control *c, struct jffs_file *f, - const struct jffs_raw_inode *raw_inode, - const char *name, struct jffs_node *node) -{ - int update_name = 0; - int insert_into_tree = 0; - - D2(printk("jffs_insert_node(): ino = %u, version = %u, " - "name = \"%s\", deleted = %d\n", - raw_inode->ino, raw_inode->version, - ((name && *name) ? name : ""), raw_inode->deleted)); - - /* If there doesn't exist an associated jffs_file, then - create, initialize and insert one into the file system. */ - if (!f && !(f = jffs_find_file(c, raw_inode->ino))) { - if (!(f = jffs_create_file(c, raw_inode))) { - return -ENOMEM; - } - jffs_insert_file_into_hash(f); - insert_into_tree = 1; - } - node->ino = raw_inode->ino; - node->version = raw_inode->version; - node->data_size = raw_inode->dsize; - node->fm_offset = sizeof(struct jffs_raw_inode) + raw_inode->nsize - + JFFS_GET_PAD_BYTES(raw_inode->nsize); - node->name_size = raw_inode->nsize; - - /* Now insert the node at the correct position into the file's - version list. */ - if (!f->version_head) { - /* This is the first node. */ - f->version_head = node; - f->version_tail = node; - node->version_prev = NULL; - node->version_next = NULL; - f->highest_version = node->version; - update_name = 1; - f->mode = raw_inode->mode; - f->uid = raw_inode->uid; - f->gid = raw_inode->gid; - f->atime = raw_inode->atime; - f->mtime = raw_inode->mtime; - f->ctime = raw_inode->ctime; - } - else if ((f->highest_version < node->version) - || (node->version == 0)) { - /* Insert at the end of the list. I.e. this node is the - newest one so far. */ - node->version_prev = f->version_tail; - node->version_next = NULL; - f->version_tail->version_next = node; - f->version_tail = node; - f->highest_version = node->version; - update_name = 1; - f->pino = raw_inode->pino; - f->mode = raw_inode->mode; - f->uid = raw_inode->uid; - f->gid = raw_inode->gid; - f->atime = raw_inode->atime; - f->mtime = raw_inode->mtime; - f->ctime = raw_inode->ctime; - } - else if (f->version_head->version > node->version) { - /* Insert at the bottom of the list. */ - node->version_prev = NULL; - node->version_next = f->version_head; - f->version_head->version_prev = node; - f->version_head = node; - if (!f->name) { - update_name = 1; - } - } - else { - struct jffs_node *n; - int newer_name = 0; - /* Search for the insertion position starting from - the tail (newest node). */ - for (n = f->version_tail; n; n = n->version_prev) { - if (n->version < node->version) { - node->version_prev = n; - node->version_next = n->version_next; - node->version_next->version_prev = node; - n->version_next = node; - if (!newer_name) { - update_name = 1; - } - break; - } - if (n->name_size) { - newer_name = 1; - } - } - } - - /* Deletion is irreversible. If any 'deleted' node is ever - written, the file is deleted */ - if (raw_inode->deleted) - f->deleted = raw_inode->deleted; - - /* Perhaps update the name. */ - if (raw_inode->nsize && update_name && name && *name && (name != f->name)) { - if (f->name) { - kfree(f->name); - DJM(no_name--); - } - if (!(f->name = kmalloc(raw_inode->nsize + 1, - GFP_KERNEL))) { - return -ENOMEM; - } - DJM(no_name++); - memcpy(f->name, name, raw_inode->nsize); - f->name[raw_inode->nsize] = '\0'; - f->nsize = raw_inode->nsize; - D3(printk("jffs_insert_node(): Updated the name of " - "the file to \"%s\".\n", name)); - } - - if (!c->building_fs) { - D3(printk("jffs_insert_node(): ---------------------------" - "------------------------------------------- 1\n")); - if (insert_into_tree) { - jffs_insert_file_into_tree(f); - } - /* Once upon a time, we would call jffs_possibly_delete_file() - here. That causes an oops if someone's still got the file - open, so now we only do it in jffs_delete_inode() - -- dwmw2 - */ - if (node->data_size || node->removed_size) { - jffs_update_file(f, node); - } - jffs_remove_redundant_nodes(f); - - jffs_garbage_collect_trigger(c); - - D3(printk("jffs_insert_node(): ---------------------------" - "------------------------------------------- 2\n")); - } - - return 0; -} /* jffs_insert_node() */ - - -/* Unlink a jffs_node from the version list it is in. */ -static inline void -jffs_unlink_node_from_version_list(struct jffs_file *f, - struct jffs_node *node) -{ - if (node->version_prev) { - node->version_prev->version_next = node->version_next; - } else { - f->version_head = node->version_next; - } - if (node->version_next) { - node->version_next->version_prev = node->version_prev; - } else { - f->version_tail = node->version_prev; - } -} - - -/* Unlink a jffs_node from the range list it is in. */ -static inline void -jffs_unlink_node_from_range_list(struct jffs_file *f, struct jffs_node *node) -{ - if (node->range_prev) { - node->range_prev->range_next = node->range_next; - } - else { - f->range_head = node->range_next; - } - if (node->range_next) { - node->range_next->range_prev = node->range_prev; - } - else { - f->range_tail = node->range_prev; - } -} - - -/* Function used by jffs_remove_redundant_nodes() below. This function - classifies what kind of information a node adds to a file. */ -static inline __u8 -jffs_classify_node(struct jffs_node *node) -{ - __u8 mod_type = JFFS_MODIFY_INODE; - - if (node->name_size) { - mod_type |= JFFS_MODIFY_NAME; - } - if (node->data_size || node->removed_size) { - mod_type |= JFFS_MODIFY_DATA; - } - return mod_type; -} - - -/* Remove redundant nodes from a file. Mark the on-flash memory - as dirty. */ -static int -jffs_remove_redundant_nodes(struct jffs_file *f) -{ - struct jffs_node *newest_node; - struct jffs_node *cur; - struct jffs_node *prev; - __u8 newest_type; - __u8 mod_type; - __u8 node_with_name_later = 0; - - if (!(newest_node = f->version_tail)) { - return 0; - } - - /* What does the `newest_node' modify? */ - newest_type = jffs_classify_node(newest_node); - node_with_name_later = newest_type & JFFS_MODIFY_NAME; - - D3(printk("jffs_remove_redundant_nodes(): ino: %u, name: \"%s\", " - "newest_type: %u\n", f->ino, (f->name ? f->name : ""), - newest_type)); - - /* Traverse the file's nodes and determine which of them that are - superfluous. Yeah, this might look very complex at first - glance but it is actually very simple. */ - for (cur = newest_node->version_prev; cur; cur = prev) { - prev = cur->version_prev; - mod_type = jffs_classify_node(cur); - if ((mod_type <= JFFS_MODIFY_INODE) - || ((newest_type & JFFS_MODIFY_NAME) - && (mod_type - <= (JFFS_MODIFY_INODE + JFFS_MODIFY_NAME))) - || (cur->data_size == 0 && cur->removed_size - && !cur->version_prev && node_with_name_later)) { - /* Yes, this node is redundant. Remove it. */ - D2(printk("jffs_remove_redundant_nodes(): " - "Removing node: ino: %u, version: %u, " - "mod_type: %u\n", cur->ino, cur->version, - mod_type)); - jffs_unlink_node_from_version_list(f, cur); - jffs_fmfree(f->c->fmc, cur->fm, cur); - jffs_free_node(cur); - DJM(no_jffs_node--); - } - else { - node_with_name_later |= (mod_type & JFFS_MODIFY_NAME); - } - } - - return 0; -} - - -/* Insert a file into the hash table. */ -static int -jffs_insert_file_into_hash(struct jffs_file *f) -{ - int i = f->ino % f->c->hash_len; - - D3(printk("jffs_insert_file_into_hash(): f->ino: %u\n", f->ino)); - - list_add(&f->hash, &f->c->hash[i]); - return 0; -} - - -/* Insert a file into the file system tree. */ -int -jffs_insert_file_into_tree(struct jffs_file *f) -{ - struct jffs_file *parent; - - D3(printk("jffs_insert_file_into_tree(): name: \"%s\"\n", - (f->name ? f->name : ""))); - - if (!(parent = jffs_find_file(f->c, f->pino))) { - if (f->pino == 0) { - f->c->root = f; - f->parent = NULL; - f->sibling_prev = NULL; - f->sibling_next = NULL; - return 0; - } - else { - D1(printk("jffs_insert_file_into_tree(): Found " - "inode with no parent and pino == %u\n", - f->pino)); - return -1; - } - } - f->parent = parent; - f->sibling_next = parent->children; - if (f->sibling_next) { - f->sibling_next->sibling_prev = f; - } - f->sibling_prev = NULL; - parent->children = f; - return 0; -} - - -/* Remove a file from the hash table. */ -static int -jffs_unlink_file_from_hash(struct jffs_file *f) -{ - D3(printk("jffs_unlink_file_from_hash(): f: 0x%p, " - "ino %u\n", f, f->ino)); - - list_del(&f->hash); - return 0; -} - - -/* Just remove the file from the parent's children. Don't free - any memory. */ -int -jffs_unlink_file_from_tree(struct jffs_file *f) -{ - D3(printk("jffs_unlink_file_from_tree(): ino: %d, pino: %d, name: " - "\"%s\"\n", f->ino, f->pino, (f->name ? f->name : ""))); - - if (f->sibling_prev) { - f->sibling_prev->sibling_next = f->sibling_next; - } - else if (f->parent) { - D3(printk("f->parent=%p\n", f->parent)); - f->parent->children = f->sibling_next; - } - if (f->sibling_next) { - f->sibling_next->sibling_prev = f->sibling_prev; - } - return 0; -} - - -/* Find a file with its inode number. */ -struct jffs_file * -jffs_find_file(struct jffs_control *c, __u32 ino) -{ - struct jffs_file *f; - int i = ino % c->hash_len; - - D3(printk("jffs_find_file(): ino: %u\n", ino)); - - list_for_each_entry(f, &c->hash[i], hash) { - if (ino != f->ino) - continue; - D3(printk("jffs_find_file(): Found file with ino " - "%u. (name: \"%s\")\n", - ino, (f->name ? f->name : "")); - ); - return f; - } - D3(printk("jffs_find_file(): Didn't find file " - "with ino %u.\n", ino); - ); - return NULL; -} - - -/* Find a file in a directory. We are comparing the names. */ -struct jffs_file * -jffs_find_child(struct jffs_file *dir, const char *name, int len) -{ - struct jffs_file *f; - - D3(printk("jffs_find_child()\n")); - - for (f = dir->children; f; f = f->sibling_next) { - if (!f->deleted && f->name - && !strncmp(f->name, name, len) - && f->name[len] == '\0') { - break; - } - } - - D3(if (f) { - printk("jffs_find_child(): Found \"%s\".\n", f->name); - } - else { - char *copy = kmalloc(len + 1, GFP_KERNEL); - if (copy) { - memcpy(copy, name, len); - copy[len] = '\0'; - } - printk("jffs_find_child(): Didn't find the file \"%s\".\n", - (copy ? copy : "")); - kfree(copy); - }); - - return f; -} - - -/* Write a raw inode that takes up a certain amount of space in the flash - memory. At the end of the flash device, there is often space that is - impossible to use. At these times we want to mark this space as not - used. In the cases when the amount of space is greater or equal than - a struct jffs_raw_inode, we write a "dummy node" that takes up this - space. The space after the raw inode, if it exists, is left as it is. - Since this space after the raw inode contains JFFS_EMPTY_BITMASK bytes, - we can compute the checksum of it; we don't have to manipulate it any - further. - - If the space left on the device is less than the size of a struct - jffs_raw_inode, this space is filled with JFFS_DIRTY_BITMASK bytes. - No raw inode is written this time. */ -static int -jffs_write_dummy_node(struct jffs_control *c, struct jffs_fm *dirty_fm) -{ - struct jffs_fmcontrol *fmc = c->fmc; - int err; - - D1(printk("jffs_write_dummy_node(): dirty_fm->offset = 0x%08x, " - "dirty_fm->size = %u\n", - dirty_fm->offset, dirty_fm->size)); - - if (dirty_fm->size >= sizeof(struct jffs_raw_inode)) { - struct jffs_raw_inode raw_inode; - memset(&raw_inode, 0, sizeof(struct jffs_raw_inode)); - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.dsize = dirty_fm->size - - sizeof(struct jffs_raw_inode); - raw_inode.dchksum = raw_inode.dsize * 0xff; - raw_inode.chksum - = jffs_checksum(&raw_inode, sizeof(struct jffs_raw_inode)); - - if ((err = flash_safe_write(fmc->mtd, - dirty_fm->offset, - (u_char *)&raw_inode, - sizeof(struct jffs_raw_inode))) - < 0) { - printk(KERN_ERR "JFFS: jffs_write_dummy_node: " - "flash_safe_write failed!\n"); - return err; - } - } - else { - flash_safe_acquire(fmc->mtd); - flash_memset(fmc->mtd, dirty_fm->offset, 0, dirty_fm->size); - flash_safe_release(fmc->mtd); - } - - D3(printk("jffs_write_dummy_node(): Leaving...\n")); - return 0; -} - - -/* Write a raw inode, possibly its name and possibly some data. */ -int -jffs_write_node(struct jffs_control *c, struct jffs_node *node, - struct jffs_raw_inode *raw_inode, - const char *name, const unsigned char *data, - int recoverable, - struct jffs_file *f) -{ - struct jffs_fmcontrol *fmc = c->fmc; - struct jffs_fm *fm; - struct kvec node_iovec[4]; - unsigned long iovec_cnt; - - __u32 pos; - int err; - __u32 slack = 0; - - __u32 total_name_size = raw_inode->nsize - + JFFS_GET_PAD_BYTES(raw_inode->nsize); - __u32 total_data_size = raw_inode->dsize - + JFFS_GET_PAD_BYTES(raw_inode->dsize); - __u32 total_size = sizeof(struct jffs_raw_inode) - + total_name_size + total_data_size; - - /* If this node isn't something that will eventually let - GC free even more space, then don't allow it unless - there's at least max_chunk_size space still available - */ - if (!recoverable) - slack = fmc->max_chunk_size; - - - /* Fire the retrorockets and shoot the fruiton torpedoes, sir! */ - - ASSERT(if (!node) { - printk("jffs_write_node(): node == NULL\n"); - return -EINVAL; - }); - ASSERT(if (raw_inode && raw_inode->nsize && !name) { - printk("*** jffs_write_node(): nsize = %u but name == NULL\n", - raw_inode->nsize); - return -EINVAL; - }); - - D1(printk("jffs_write_node(): filename = \"%s\", ino = %u, " - "total_size = %u\n", - (name ? name : ""), raw_inode->ino, - total_size)); - - jffs_fm_write_lock(fmc); - -retry: - fm = NULL; - err = 0; - while (!fm) { - - /* Deadlocks suck. */ - while(fmc->free_size < fmc->min_free_size + total_size + slack) { - jffs_fm_write_unlock(fmc); - if (!JFFS_ENOUGH_SPACE(c, total_size + slack)) - return -ENOSPC; - jffs_fm_write_lock(fmc); - } - - /* First try to allocate some flash memory. */ - err = jffs_fmalloc(fmc, total_size, node, &fm); - - if (err == -ENOSPC) { - /* Just out of space. GC and try again */ - if (fmc->dirty_size < fmc->sector_size) { - D(printk("jffs_write_node(): jffs_fmalloc(0x%p, %u) " - "failed, no dirty space to GC\n", fmc, - total_size)); - return err; - } - - D1(printk(KERN_INFO "jffs_write_node(): Calling jffs_garbage_collect_now()\n")); - jffs_fm_write_unlock(fmc); - if ((err = jffs_garbage_collect_now(c))) { - D(printk("jffs_write_node(): jffs_garbage_collect_now() failed\n")); - return err; - } - jffs_fm_write_lock(fmc); - continue; - } - - if (err < 0) { - jffs_fm_write_unlock(fmc); - - D(printk("jffs_write_node(): jffs_fmalloc(0x%p, %u) " - "failed!\n", fmc, total_size)); - return err; - } - - if (!fm->nodes) { - /* The jffs_fm struct that we got is not good enough. - Make that space dirty and try again */ - if ((err = jffs_write_dummy_node(c, fm)) < 0) { - kfree(fm); - DJM(no_jffs_fm--); - jffs_fm_write_unlock(fmc); - D(printk("jffs_write_node(): " - "jffs_write_dummy_node(): Failed!\n")); - return err; - } - fm = NULL; - } - } /* while(!fm) */ - node->fm = fm; - - ASSERT(if (fm->nodes == 0) { - printk(KERN_ERR "jffs_write_node(): fm->nodes == 0\n"); - }); - - pos = node->fm->offset; - - /* Increment the version number here. We can't let the caller - set it beforehand, because we might have had to do GC on a node - of this file - and we'd end up reusing version numbers. - */ - if (f) { - raw_inode->version = f->highest_version + 1; - D1(printk (KERN_NOTICE "jffs_write_node(): setting version of %s to %d\n", f->name, raw_inode->version)); - - /* if the file was deleted, set the deleted bit in the raw inode */ - if (f->deleted) - raw_inode->deleted = 1; - } - - /* Compute the checksum for the data and name chunks. */ - raw_inode->dchksum = jffs_checksum(data, raw_inode->dsize); - raw_inode->nchksum = jffs_checksum(name, raw_inode->nsize); - - /* The checksum is calculated without the chksum and accurate - fields so set them to zero first. */ - raw_inode->accurate = 0; - raw_inode->chksum = 0; - raw_inode->chksum = jffs_checksum(raw_inode, - sizeof(struct jffs_raw_inode)); - raw_inode->accurate = 0xff; - - D3(printk("jffs_write_node(): About to write this raw inode to the " - "flash at pos 0x%lx:\n", (long)pos)); - D3(jffs_print_raw_inode(raw_inode)); - - /* The actual raw JFFS node */ - node_iovec[0].iov_base = (void *) raw_inode; - node_iovec[0].iov_len = (size_t) sizeof(struct jffs_raw_inode); - iovec_cnt = 1; - - /* Get name and size if there is one */ - if (raw_inode->nsize) { - node_iovec[iovec_cnt].iov_base = (void *) name; - node_iovec[iovec_cnt].iov_len = (size_t) raw_inode->nsize; - iovec_cnt++; - - if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) { - static unsigned char allff[3]={255,255,255}; - /* Add some extra padding if necessary */ - node_iovec[iovec_cnt].iov_base = allff; - node_iovec[iovec_cnt].iov_len = - JFFS_GET_PAD_BYTES(raw_inode->nsize); - iovec_cnt++; - } - } - - /* Get data and size if there is any */ - if (raw_inode->dsize) { - node_iovec[iovec_cnt].iov_base = (void *) data; - node_iovec[iovec_cnt].iov_len = (size_t) raw_inode->dsize; - iovec_cnt++; - /* No need to pad this because we're not actually putting - anything after it. - */ - } - - if ((err = flash_safe_writev(fmc->mtd, node_iovec, iovec_cnt, - pos)) < 0) { - jffs_fmfree_partly(fmc, fm, 0); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_write_node: Failed to write, " - "requested %i, wrote %i\n", total_size, err); - goto retry; - } - if (raw_inode->deleted) - f->deleted = 1; - - jffs_fm_write_unlock(fmc); - D3(printk("jffs_write_node(): Leaving...\n")); - return raw_inode->dsize; -} /* jffs_write_node() */ - - -/* Read data from the node and write it to the buffer. 'node_offset' - is how much we have read from this particular node before and which - shouldn't be read again. 'max_size' is how much space there is in - the buffer. */ -static int -jffs_get_node_data(struct jffs_file *f, struct jffs_node *node, - unsigned char *buf,__u32 node_offset, __u32 max_size) -{ - struct jffs_fmcontrol *fmc = f->c->fmc; - __u32 pos = node->fm->offset + node->fm_offset + node_offset; - __u32 avail = node->data_size - node_offset; - __u32 r; - - D2(printk(" jffs_get_node_data(): file: \"%s\", ino: %u, " - "version: %u, node_offset: %u\n", - f->name, node->ino, node->version, node_offset)); - - r = min(avail, max_size); - D3(printk(KERN_NOTICE "jffs_get_node_data\n")); - flash_safe_read(fmc->mtd, pos, buf, r); - - D3(printk(" jffs_get_node_data(): Read %u byte%s.\n", - r, (r == 1 ? "" : "s"))); - - return r; -} - - -/* Read data from the file's nodes. Write the data to the buffer - 'buf'. 'read_offset' tells how much data we should skip. */ -int -jffs_read_data(struct jffs_file *f, unsigned char *buf, __u32 read_offset, - __u32 size) -{ - struct jffs_node *node; - __u32 read_data = 0; /* Total amount of read data. */ - __u32 node_offset = 0; - __u32 pos = 0; /* Number of bytes traversed. */ - - D2(printk("jffs_read_data(): file = \"%s\", read_offset = %d, " - "size = %u\n", - (f->name ? f->name : ""), read_offset, size)); - - if (read_offset >= f->size) { - D(printk(" f->size: %d\n", f->size)); - return 0; - } - - /* First find the node to read data from. */ - node = f->range_head; - while (pos <= read_offset) { - node_offset = read_offset - pos; - if (node_offset >= node->data_size) { - pos += node->data_size; - node = node->range_next; - } - else { - break; - } - } - - /* "Cats are living proof that not everything in nature - has to be useful." - - Garrison Keilor ('97) */ - - /* Fill the buffer. */ - while (node && (read_data < size)) { - int r; - if (!node->fm) { - /* This node does not refer to real data. */ - r = min(size - read_data, - node->data_size - node_offset); - memset(&buf[read_data], 0, r); - } - else if ((r = jffs_get_node_data(f, node, &buf[read_data], - node_offset, - size - read_data)) < 0) { - return r; - } - read_data += r; - node_offset = 0; - node = node->range_next; - } - D3(printk(" jffs_read_data(): Read %u bytes.\n", read_data)); - return read_data; -} - - -/* Used for traversing all nodes in the hash table. */ -int -jffs_foreach_file(struct jffs_control *c, int (*func)(struct jffs_file *)) -{ - int pos; - int r; - int result = 0; - - for (pos = 0; pos < c->hash_len; pos++) { - struct jffs_file *f, *next; - - /* We must do _safe, because 'func' might remove the - current file 'f' from the list. */ - list_for_each_entry_safe(f, next, &c->hash[pos], hash) { - r = func(f); - if (r < 0) - return r; - result += r; - } - } - - return result; -} - - -/* Free all nodes associated with a file. */ -static int -jffs_free_node_list(struct jffs_file *f) -{ - struct jffs_node *node; - struct jffs_node *p; - - D3(printk("jffs_free_node_list(): f #%u, \"%s\"\n", - f->ino, (f->name ? f->name : ""))); - node = f->version_head; - while (node) { - p = node; - node = node->version_next; - jffs_free_node(p); - DJM(no_jffs_node--); - } - return 0; -} - - -/* Free a file and its name. */ -static int -jffs_free_file(struct jffs_file *f) -{ - D3(printk("jffs_free_file: f #%u, \"%s\"\n", - f->ino, (f->name ? f->name : ""))); - - if (f->name) { - kfree(f->name); - DJM(no_name--); - } - kfree(f); - no_jffs_file--; - return 0; -} - -static long -jffs_get_file_count(void) -{ - return no_jffs_file; -} - -/* See if a file is deleted. If so, mark that file's nodes as obsolete. */ -int -jffs_possibly_delete_file(struct jffs_file *f) -{ - struct jffs_node *n; - - D3(printk("jffs_possibly_delete_file(): ino: %u\n", - f->ino)); - - ASSERT(if (!f) { - printk(KERN_ERR "jffs_possibly_delete_file(): f == NULL\n"); - return -1; - }); - - if (f->deleted) { - /* First try to remove all older versions. Commence with - the oldest node. */ - for (n = f->version_head; n; n = n->version_next) { - if (!n->fm) { - continue; - } - if (jffs_fmfree(f->c->fmc, n->fm, n) < 0) { - break; - } - } - /* Unlink the file from the filesystem. */ - if (!f->c->building_fs) { - jffs_unlink_file_from_tree(f); - } - jffs_unlink_file_from_hash(f); - jffs_free_node_list(f); - jffs_free_file(f); - } - return 0; -} - - -/* Used in conjunction with jffs_foreach_file() to count the number - of files in the file system. */ -int -jffs_file_count(struct jffs_file *f) -{ - return 1; -} - - -/* Build up a file's range list from scratch by going through the - version list. */ -static int -jffs_build_file(struct jffs_file *f) -{ - struct jffs_node *n; - - D3(printk("jffs_build_file(): ino: %u, name: \"%s\"\n", - f->ino, (f->name ? f->name : ""))); - - for (n = f->version_head; n; n = n->version_next) { - jffs_update_file(f, n); - } - return 0; -} - - -/* Remove an amount of data from a file. If this amount of data is - zero, that could mean that a node should be split in two parts. - We remove or change the appropriate nodes in the lists. - - Starting offset of area to be removed is node->data_offset, - and the length of the area is in node->removed_size. */ -static int -jffs_delete_data(struct jffs_file *f, struct jffs_node *node) -{ - struct jffs_node *n; - __u32 offset = node->data_offset; - __u32 remove_size = node->removed_size; - - D3(printk("jffs_delete_data(): offset = %u, remove_size = %u\n", - offset, remove_size)); - - if (remove_size == 0 - && f->range_tail - && f->range_tail->data_offset + f->range_tail->data_size - == offset) { - /* A simple append; nothing to remove or no node to split. */ - return 0; - } - - /* Find the node where we should begin the removal. */ - for (n = f->range_head; n; n = n->range_next) { - if (n->data_offset + n->data_size > offset) { - break; - } - } - if (!n) { - /* If there's no data in the file there's no data to - remove either. */ - return 0; - } - - if (n->data_offset > offset) { - /* XXX: Not implemented yet. */ - printk(KERN_WARNING "JFFS: An unexpected situation " - "occurred in jffs_delete_data.\n"); - } - else if (n->data_offset < offset) { - /* See if the node has to be split into two parts. */ - if (n->data_offset + n->data_size > offset + remove_size) { - /* Do the split. */ - struct jffs_node *new_node; - D3(printk("jffs_delete_data(): Split node with " - "version number %u.\n", n->version)); - - if (!(new_node = jffs_alloc_node())) { - D(printk("jffs_delete_data(): -ENOMEM\n")); - return -ENOMEM; - } - DJM(no_jffs_node++); - - new_node->ino = n->ino; - new_node->version = n->version; - new_node->data_offset = offset; - new_node->data_size = n->data_size - (remove_size + (offset - n->data_offset)); - new_node->fm_offset = n->fm_offset + (remove_size + (offset - n->data_offset)); - new_node->name_size = n->name_size; - new_node->fm = n->fm; - new_node->version_prev = n; - new_node->version_next = n->version_next; - if (new_node->version_next) { - new_node->version_next->version_prev - = new_node; - } - else { - f->version_tail = new_node; - } - n->version_next = new_node; - new_node->range_prev = n; - new_node->range_next = n->range_next; - if (new_node->range_next) { - new_node->range_next->range_prev = new_node; - } - else { - f->range_tail = new_node; - } - /* A very interesting can of worms. */ - n->range_next = new_node; - n->data_size = offset - n->data_offset; - if (new_node->fm) - jffs_add_node(new_node); - else { - D1(printk(KERN_WARNING "jffs_delete_data(): Splitting an empty node (file hold).\n!")); - D1(printk(KERN_WARNING "FIXME: Did dwmw2 do the right thing here?\n")); - } - n = new_node->range_next; - remove_size = 0; - } - else { - /* No. No need to split the node. Just remove - the end of the node. */ - int r = min(n->data_offset + n->data_size - - offset, remove_size); - n->data_size -= r; - remove_size -= r; - n = n->range_next; - } - } - - /* Remove as many nodes as necessary. */ - while (n && remove_size) { - if (n->data_size <= remove_size) { - struct jffs_node *p = n; - remove_size -= n->data_size; - n = n->range_next; - D3(printk("jffs_delete_data(): Removing node: " - "ino: %u, version: %u%s\n", - p->ino, p->version, - (p->fm ? "" : " (virtual)"))); - if (p->fm) { - jffs_fmfree(f->c->fmc, p->fm, p); - } - jffs_unlink_node_from_range_list(f, p); - jffs_unlink_node_from_version_list(f, p); - jffs_free_node(p); - DJM(no_jffs_node--); - } - else { - n->data_size -= remove_size; - n->fm_offset += remove_size; - n->data_offset -= (node->removed_size - remove_size); - n = n->range_next; - break; - } - } - - /* Adjust the following nodes' information about offsets etc. */ - while (n && node->removed_size) { - n->data_offset -= node->removed_size; - n = n->range_next; - } - - if (node->removed_size > (f->size - node->data_offset)) { - /* It's possible that the removed_size is in fact - * greater than the amount of data we actually thought - * were present in the first place - some of the nodes - * which this node originally obsoleted may already have - * been deleted from the flash by subsequent garbage - * collection. - * - * If this is the case, don't let f->size go negative. - * Bad things would happen :) - */ - f->size = node->data_offset; - } else { - f->size -= node->removed_size; - } - D3(printk("jffs_delete_data(): f->size = %d\n", f->size)); - return 0; -} /* jffs_delete_data() */ - - -/* Insert some data into a file. Prior to the call to this function, - jffs_delete_data should be called. */ -static int -jffs_insert_data(struct jffs_file *f, struct jffs_node *node) -{ - D3(printk("jffs_insert_data(): node->data_offset = %u, " - "node->data_size = %u, f->size = %u\n", - node->data_offset, node->data_size, f->size)); - - /* Find the position where we should insert data. */ - retry: - if (node->data_offset == f->size) { - /* A simple append. This is the most common operation. */ - node->range_next = NULL; - node->range_prev = f->range_tail; - if (node->range_prev) { - node->range_prev->range_next = node; - } - f->range_tail = node; - f->size += node->data_size; - if (!f->range_head) { - f->range_head = node; - } - } - else if (node->data_offset < f->size) { - /* Trying to insert data into the middle of the file. This - means no problem because jffs_delete_data() has already - prepared the range list for us. */ - struct jffs_node *n; - - /* Find the correct place for the insertion and then insert - the node. */ - for (n = f->range_head; n; n = n->range_next) { - D2(printk("Cool stuff's happening!\n")); - - if (n->data_offset == node->data_offset) { - node->range_prev = n->range_prev; - if (node->range_prev) { - node->range_prev->range_next = node; - } - else { - f->range_head = node; - } - node->range_next = n; - n->range_prev = node; - break; - } - ASSERT(else if (n->data_offset + n->data_size > - node->data_offset) { - printk(KERN_ERR "jffs_insert_data(): " - "Couldn't find a place to insert " - "the data!\n"); - return -1; - }); - } - - /* Adjust later nodes' offsets etc. */ - n = node->range_next; - while (n) { - n->data_offset += node->data_size; - n = n->range_next; - } - f->size += node->data_size; - } - else if (node->data_offset > f->size) { - /* Okay. This is tricky. This means that we want to insert - data at a place that is beyond the limits of the file as - it is constructed right now. This is actually a common - event that for instance could occur during the mounting - of the file system if a large file have been truncated, - rewritten and then only partially garbage collected. */ - - struct jffs_node *n; - - /* We need a place holder for the data that is missing in - front of this insertion. This "virtual node" will not - be associated with any space on the flash device. */ - struct jffs_node *virtual_node; - if (!(virtual_node = jffs_alloc_node())) { - return -ENOMEM; - } - - D(printk("jffs_insert_data: Inserting a virtual node.\n")); - D(printk(" node->data_offset = %u\n", node->data_offset)); - D(printk(" f->size = %u\n", f->size)); - - virtual_node->ino = node->ino; - virtual_node->version = node->version; - virtual_node->removed_size = 0; - virtual_node->fm_offset = 0; - virtual_node->name_size = 0; - virtual_node->fm = NULL; /* This is a virtual data holder. */ - virtual_node->version_prev = NULL; - virtual_node->version_next = NULL; - virtual_node->range_next = NULL; - - /* Are there any data at all in the file yet? */ - if (f->range_head) { - virtual_node->data_offset - = f->range_tail->data_offset - + f->range_tail->data_size; - virtual_node->data_size - = node->data_offset - virtual_node->data_offset; - virtual_node->range_prev = f->range_tail; - f->range_tail->range_next = virtual_node; - } - else { - virtual_node->data_offset = 0; - virtual_node->data_size = node->data_offset; - virtual_node->range_prev = NULL; - f->range_head = virtual_node; - } - - f->range_tail = virtual_node; - f->size += virtual_node->data_size; - - /* Insert this virtual node in the version list as well. */ - for (n = f->version_head; n ; n = n->version_next) { - if (n->version == virtual_node->version) { - virtual_node->version_prev = n->version_prev; - n->version_prev = virtual_node; - if (virtual_node->version_prev) { - virtual_node->version_prev - ->version_next = virtual_node; - } - else { - f->version_head = virtual_node; - } - virtual_node->version_next = n; - break; - } - } - - D(jffs_print_node(virtual_node)); - - /* Make a new try to insert the node. */ - goto retry; - } - - D3(printk("jffs_insert_data(): f->size = %d\n", f->size)); - return 0; -} - - -/* A new node (with data) has been added to the file and now the range - list has to be modified. */ -static int -jffs_update_file(struct jffs_file *f, struct jffs_node *node) -{ - int err; - - D3(printk("jffs_update_file(): ino: %u, version: %u\n", - f->ino, node->version)); - - if (node->data_size == 0) { - if (node->removed_size == 0) { - /* data_offset == X */ - /* data_size == 0 */ - /* remove_size == 0 */ - } - else { - /* data_offset == X */ - /* data_size == 0 */ - /* remove_size != 0 */ - if ((err = jffs_delete_data(f, node)) < 0) { - return err; - } - } - } - else { - /* data_offset == X */ - /* data_size != 0 */ - /* remove_size == Y */ - if ((err = jffs_delete_data(f, node)) < 0) { - return err; - } - if ((err = jffs_insert_data(f, node)) < 0) { - return err; - } - } - return 0; -} - -/* Print the contents of a file. */ -#if 0 -int -jffs_print_file(struct jffs_file *f) -{ - D(int i); - D(printk("jffs_file: 0x%p\n", f)); - D(printk("{\n")); - D(printk(" 0x%08x, /* ino */\n", f->ino)); - D(printk(" 0x%08x, /* pino */\n", f->pino)); - D(printk(" 0x%08x, /* mode */\n", f->mode)); - D(printk(" 0x%04x, /* uid */\n", f->uid)); - D(printk(" 0x%04x, /* gid */\n", f->gid)); - D(printk(" 0x%08x, /* atime */\n", f->atime)); - D(printk(" 0x%08x, /* mtime */\n", f->mtime)); - D(printk(" 0x%08x, /* ctime */\n", f->ctime)); - D(printk(" 0x%02x, /* nsize */\n", f->nsize)); - D(printk(" 0x%02x, /* nlink */\n", f->nlink)); - D(printk(" 0x%02x, /* deleted */\n", f->deleted)); - D(printk(" \"%s\", ", (f->name ? f->name : ""))); - D(for (i = strlen(f->name ? f->name : ""); i < 8; ++i) { - printk(" "); - }); - D(printk("/* name */\n")); - D(printk(" 0x%08x, /* size */\n", f->size)); - D(printk(" 0x%08x, /* highest_version */\n", - f->highest_version)); - D(printk(" 0x%p, /* c */\n", f->c)); - D(printk(" 0x%p, /* parent */\n", f->parent)); - D(printk(" 0x%p, /* children */\n", f->children)); - D(printk(" 0x%p, /* sibling_prev */\n", f->sibling_prev)); - D(printk(" 0x%p, /* sibling_next */\n", f->sibling_next)); - D(printk(" 0x%p, /* hash_prev */\n", f->hash.prev)); - D(printk(" 0x%p, /* hash_next */\n", f->hash.next)); - D(printk(" 0x%p, /* range_head */\n", f->range_head)); - D(printk(" 0x%p, /* range_tail */\n", f->range_tail)); - D(printk(" 0x%p, /* version_head */\n", f->version_head)); - D(printk(" 0x%p, /* version_tail */\n", f->version_tail)); - D(printk("}\n")); - return 0; -} -#endif /* 0 */ - -void -jffs_print_hash_table(struct jffs_control *c) -{ - int i; - - printk("JFFS: Dumping the file system's hash table...\n"); - for (i = 0; i < c->hash_len; i++) { - struct jffs_file *f; - list_for_each_entry(f, &c->hash[i], hash) { - printk("*** c->hash[%u]: \"%s\" " - "(ino: %u, pino: %u)\n", - i, (f->name ? f->name : ""), - f->ino, f->pino); - } - } -} - - -void -jffs_print_tree(struct jffs_file *first_file, int indent) -{ - struct jffs_file *f; - char *space; - int dir; - - if (!first_file) { - return; - } - - if (!(space = kmalloc(indent + 1, GFP_KERNEL))) { - printk("jffs_print_tree(): Out of memory!\n"); - return; - } - - memset(space, ' ', indent); - space[indent] = '\0'; - - for (f = first_file; f; f = f->sibling_next) { - dir = S_ISDIR(f->mode); - printk("%s%s%s (ino: %u, highest_version: %u, size: %u)\n", - space, (f->name ? f->name : ""), (dir ? "/" : ""), - f->ino, f->highest_version, f->size); - if (dir) { - jffs_print_tree(f->children, indent + 2); - } - } - - kfree(space); -} - - -#if defined(JFFS_MEMORY_DEBUG) && JFFS_MEMORY_DEBUG -void -jffs_print_memory_allocation_statistics(void) -{ - static long printout; - printk("________ Memory printout #%ld ________\n", ++printout); - printk("no_jffs_file = %ld\n", no_jffs_file); - printk("no_jffs_node = %ld\n", no_jffs_node); - printk("no_jffs_control = %ld\n", no_jffs_control); - printk("no_jffs_raw_inode = %ld\n", no_jffs_raw_inode); - printk("no_jffs_node_ref = %ld\n", no_jffs_node_ref); - printk("no_jffs_fm = %ld\n", no_jffs_fm); - printk("no_jffs_fmcontrol = %ld\n", no_jffs_fmcontrol); - printk("no_hash = %ld\n", no_hash); - printk("no_name = %ld\n", no_name); - printk("\n"); -} -#endif - - -/* Rewrite `size' bytes, and begin at `node'. */ -static int -jffs_rewrite_data(struct jffs_file *f, struct jffs_node *node, __u32 size) -{ - struct jffs_control *c = f->c; - struct jffs_fmcontrol *fmc = c->fmc; - struct jffs_raw_inode raw_inode; - struct jffs_node *new_node; - struct jffs_fm *fm; - __u32 pos; - __u32 pos_dchksum; - __u32 total_name_size; - __u32 total_data_size; - __u32 total_size; - int err; - - D1(printk("***jffs_rewrite_data(): node: %u, name: \"%s\", size: %u\n", - f->ino, (f->name ? f->name : "(null)"), size)); - - /* Create and initialize the new node. */ - if (!(new_node = jffs_alloc_node())) { - D(printk("jffs_rewrite_data(): " - "Failed to allocate node.\n")); - return -ENOMEM; - } - DJM(no_jffs_node++); - new_node->data_offset = node->data_offset; - new_node->removed_size = size; - total_name_size = JFFS_PAD(f->nsize); - total_data_size = JFFS_PAD(size); - total_size = sizeof(struct jffs_raw_inode) - + total_name_size + total_data_size; - new_node->fm_offset = sizeof(struct jffs_raw_inode) - + total_name_size; - -retry: - jffs_fm_write_lock(fmc); - err = 0; - - if ((err = jffs_fmalloc(fmc, total_size, new_node, &fm)) < 0) { - DJM(no_jffs_node--); - jffs_fm_write_unlock(fmc); - D(printk("jffs_rewrite_data(): Failed to allocate fm.\n")); - jffs_free_node(new_node); - return err; - } - else if (!fm->nodes) { - /* The jffs_fm struct that we got is not big enough. */ - /* This should never happen, because we deal with this case - in jffs_garbage_collect_next().*/ - printk(KERN_WARNING "jffs_rewrite_data(): Allocated node is too small (%d bytes of %d)\n", fm->size, total_size); - if ((err = jffs_write_dummy_node(c, fm)) < 0) { - D(printk("jffs_rewrite_data(): " - "jffs_write_dummy_node() Failed!\n")); - } else { - err = -ENOSPC; - } - DJM(no_jffs_fm--); - jffs_fm_write_unlock(fmc); - kfree(fm); - - return err; - } - new_node->fm = fm; - - /* Initialize the raw inode. */ - raw_inode.magic = JFFS_MAGIC_BITMASK; - raw_inode.ino = f->ino; - raw_inode.pino = f->pino; - raw_inode.version = f->highest_version + 1; - raw_inode.mode = f->mode; - raw_inode.uid = f->uid; - raw_inode.gid = f->gid; - raw_inode.atime = f->atime; - raw_inode.mtime = f->mtime; - raw_inode.ctime = f->ctime; - raw_inode.offset = node->data_offset; - raw_inode.dsize = size; - raw_inode.rsize = size; - raw_inode.nsize = f->nsize; - raw_inode.nlink = f->nlink; - raw_inode.spare = 0; - raw_inode.rename = 0; - raw_inode.deleted = f->deleted; - raw_inode.accurate = 0xff; - raw_inode.dchksum = 0; - raw_inode.nchksum = 0; - - pos = new_node->fm->offset; - pos_dchksum = pos +JFFS_RAW_INODE_DCHKSUM_OFFSET; - - D3(printk("jffs_rewrite_data(): Writing this raw inode " - "to pos 0x%ul.\n", pos)); - D3(jffs_print_raw_inode(&raw_inode)); - - if ((err = flash_safe_write(fmc->mtd, pos, - (u_char *) &raw_inode, - sizeof(struct jffs_raw_inode) - - sizeof(__u32) - - sizeof(__u16) - sizeof(__u16))) < 0) { - jffs_fmfree_partly(fmc, fm, - total_name_size + total_data_size); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_rewrite_data: Write error during " - "rewrite. (raw inode)\n"); - printk(KERN_ERR "JFFS: jffs_rewrite_data: Now retrying " - "rewrite. (raw inode)\n"); - goto retry; - } - pos += sizeof(struct jffs_raw_inode); - - /* Write the name to the flash memory. */ - if (f->nsize) { - D3(printk("jffs_rewrite_data(): Writing name \"%s\" to " - "pos 0x%ul.\n", f->name, (unsigned int) pos)); - if ((err = flash_safe_write(fmc->mtd, pos, - (u_char *)f->name, - f->nsize)) < 0) { - jffs_fmfree_partly(fmc, fm, total_data_size); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_rewrite_data: Write " - "error during rewrite. (name)\n"); - printk(KERN_ERR "JFFS: jffs_rewrite_data: Now retrying " - "rewrite. (name)\n"); - goto retry; - } - pos += total_name_size; - raw_inode.nchksum = jffs_checksum(f->name, f->nsize); - } - - /* Write the data. */ - if (size) { - int r; - unsigned char *page; - __u32 offset = node->data_offset; - - if (!(page = (unsigned char *)__get_free_page(GFP_KERNEL))) { - jffs_fmfree_partly(fmc, fm, 0); - return -1; - } - - while (size) { - __u32 s = min(size, (__u32)PAGE_SIZE); - if ((r = jffs_read_data(f, (char *)page, - offset, s)) < s) { - free_page((unsigned long)page); - jffs_fmfree_partly(fmc, fm, 0); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_rewrite_data: " - "jffs_read_data() " - "failed! (r = %d)\n", r); - return -1; - } - if ((err = flash_safe_write(fmc->mtd, - pos, page, r)) < 0) { - free_page((unsigned long)page); - jffs_fmfree_partly(fmc, fm, 0); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_rewrite_data: " - "Write error during rewrite. " - "(data)\n"); - goto retry; - } - pos += r; - size -= r; - offset += r; - raw_inode.dchksum += jffs_checksum(page, r); - } - - free_page((unsigned long)page); - } - - raw_inode.accurate = 0; - raw_inode.chksum = jffs_checksum(&raw_inode, - sizeof(struct jffs_raw_inode) - - sizeof(__u16)); - - /* Add the checksum. */ - if ((err - = flash_safe_write(fmc->mtd, pos_dchksum, - &((u_char *) - &raw_inode)[JFFS_RAW_INODE_DCHKSUM_OFFSET], - sizeof(__u32) + sizeof(__u16) - + sizeof(__u16))) < 0) { - jffs_fmfree_partly(fmc, fm, 0); - jffs_fm_write_unlock(fmc); - printk(KERN_ERR "JFFS: jffs_rewrite_data: Write error during " - "rewrite. (checksum)\n"); - goto retry; - } - - /* Now make the file system aware of the newly written node. */ - jffs_insert_node(c, f, &raw_inode, f->name, new_node); - jffs_fm_write_unlock(fmc); - - D3(printk("jffs_rewrite_data(): Leaving...\n")); - return 0; -} /* jffs_rewrite_data() */ - - -/* jffs_garbage_collect_next implements one step in the garbage collect - process and is often called multiple times at each occasion of a - garbage collect. */ - -static int -jffs_garbage_collect_next(struct jffs_control *c) -{ - struct jffs_fmcontrol *fmc = c->fmc; - struct jffs_node *node; - struct jffs_file *f; - int err = 0; - __u32 size; - __u32 data_size; - __u32 total_name_size; - __u32 extra_available; - __u32 space_needed; - __u32 free_chunk_size1 = jffs_free_size1(fmc); - D2(__u32 free_chunk_size2 = jffs_free_size2(fmc)); - - /* Get the oldest node in the flash. */ - node = jffs_get_oldest_node(fmc); - ASSERT(if (!node) { - printk(KERN_ERR "JFFS: jffs_garbage_collect_next: " - "No oldest node found!\n"); - err = -1; - goto jffs_garbage_collect_next_end; - - - }); - - /* Find its corresponding file too. */ - f = jffs_find_file(c, node->ino); - - if (!f) { - printk (KERN_ERR "JFFS: jffs_garbage_collect_next: " - "No file to garbage collect! " - "(ino = 0x%08x)\n", node->ino); - /* FIXME: Free the offending node and recover. */ - err = -1; - goto jffs_garbage_collect_next_end; - } - - /* We always write out the name. Theoretically, we don't need - to, but for now it's easier - because otherwise we'd have - to keep track of how many times the current name exists on - the flash and make sure it never reaches zero. - - The current approach means that would be possible to cause - the GC to end up eating its tail by writing lots of nodes - with no name for it to garbage-collect. Hence the change in - inode.c to write names with _every_ node. - - It sucks, but it _should_ work. - */ - total_name_size = JFFS_PAD(f->nsize); - - D1(printk("jffs_garbage_collect_next(): \"%s\", " - "ino: %u, version: %u, location 0x%x, dsize %u\n", - (f->name ? f->name : ""), node->ino, node->version, - node->fm->offset, node->data_size)); - - /* Compute how many data it's possible to rewrite at the moment. */ - data_size = f->size - node->data_offset; - - /* And from that, the total size of the chunk we want to write */ - size = sizeof(struct jffs_raw_inode) + total_name_size - + data_size + JFFS_GET_PAD_BYTES(data_size); - - /* If that's more than max_chunk_size, reduce it accordingly */ - if (size > fmc->max_chunk_size) { - size = fmc->max_chunk_size; - data_size = size - sizeof(struct jffs_raw_inode) - - total_name_size; - } - - /* If we're asking to take up more space than free_chunk_size1 - but we _could_ fit in it, shrink accordingly. - */ - if (size > free_chunk_size1) { - - if (free_chunk_size1 < - (sizeof(struct jffs_raw_inode) + total_name_size + BLOCK_SIZE)){ - /* The space left is too small to be of any - use really. */ - struct jffs_fm *dirty_fm - = jffs_fmalloced(fmc, - fmc->tail->offset + fmc->tail->size, - free_chunk_size1, NULL); - if (!dirty_fm) { - printk(KERN_ERR "JFFS: " - "jffs_garbage_collect_next: " - "Failed to allocate `dirty' " - "flash memory!\n"); - err = -1; - goto jffs_garbage_collect_next_end; - } - D1(printk("Dirtying end of flash - too small\n")); - jffs_write_dummy_node(c, dirty_fm); - err = 0; - goto jffs_garbage_collect_next_end; - } - D1(printk("Reducing size of new node from %d to %d to avoid " - " exceeding free_chunk_size1\n", - size, free_chunk_size1)); - - size = free_chunk_size1; - data_size = size - sizeof(struct jffs_raw_inode) - - total_name_size; - } - - - /* Calculate the amount of space needed to hold the nodes - which are remaining in the tail */ - space_needed = fmc->min_free_size - (node->fm->offset % fmc->sector_size); - - /* From that, calculate how much 'extra' space we can use to - increase the size of the node we're writing from the size - of the node we're obsoleting - */ - if (space_needed > fmc->free_size) { - /* If we've gone below min_free_size for some reason, - don't fuck up. This is why we have - min_free_size > sector_size. Whinge about it though, - just so I can convince myself my maths is right. - */ - D1(printk(KERN_WARNING "jffs_garbage_collect_next(): " - "space_needed %d exceeded free_size %d\n", - space_needed, fmc->free_size)); - extra_available = 0; - } else { - extra_available = fmc->free_size - space_needed; - } - - /* Check that we don't use up any more 'extra' space than - what's available */ - if (size > JFFS_PAD(node->data_size) + total_name_size + - sizeof(struct jffs_raw_inode) + extra_available) { - D1(printk("Reducing size of new node from %d to %ld to avoid " - "catching our tail\n", size, - (long) (JFFS_PAD(node->data_size) + JFFS_PAD(node->name_size) + - sizeof(struct jffs_raw_inode) + extra_available))); - D1(printk("space_needed = %d, extra_available = %d\n", - space_needed, extra_available)); - - size = JFFS_PAD(node->data_size) + total_name_size + - sizeof(struct jffs_raw_inode) + extra_available; - data_size = size - sizeof(struct jffs_raw_inode) - - total_name_size; - }; - - D2(printk(" total_name_size: %u\n", total_name_size)); - D2(printk(" data_size: %u\n", data_size)); - D2(printk(" size: %u\n", size)); - D2(printk(" f->nsize: %u\n", f->nsize)); - D2(printk(" f->size: %u\n", f->size)); - D2(printk(" node->data_offset: %u\n", node->data_offset)); - D2(printk(" free_chunk_size1: %u\n", free_chunk_size1)); - D2(printk(" free_chunk_size2: %u\n", free_chunk_size2)); - D2(printk(" node->fm->offset: 0x%08x\n", node->fm->offset)); - - if ((err = jffs_rewrite_data(f, node, data_size))) { - printk(KERN_WARNING "jffs_rewrite_data() failed: %d\n", err); - return err; - } - -jffs_garbage_collect_next_end: - D3(printk("jffs_garbage_collect_next: Leaving...\n")); - return err; -} /* jffs_garbage_collect_next */ - - -/* If an obsolete node is partly going to be erased due to garbage - collection, the part that isn't going to be erased must be filled - with zeroes so that the scan of the flash will work smoothly next - time. (The data in the file could for instance be a JFFS image - which could cause enormous confusion during a scan of the flash - device if we didn't do this.) - There are two phases in this procedure: First, the clearing of - the name and data parts of the node. Second, possibly also clearing - a part of the raw inode as well. If the box is power cycled during - the first phase, only the checksum of this node-to-be-cleared-at- - the-end will be wrong. If the box is power cycled during, or after, - the clearing of the raw inode, the information like the length of - the name and data parts are zeroed. The next time the box is - powered up, the scanning algorithm manages this faulty data too - because: - - - The checksum is invalid and thus the raw inode must be discarded - in any case. - - If the lengths of the data part or the name part are zeroed, the - scanning just continues after the raw inode. But after the inode - the scanning procedure just finds zeroes which is the same as - dirt. - - So, in the end, this could never fail. :-) Even if it does fail, - the scanning algorithm should manage that too. */ - -static int -jffs_clear_end_of_node(struct jffs_control *c, __u32 erase_size) -{ - struct jffs_fm *fm; - struct jffs_fmcontrol *fmc = c->fmc; - __u32 zero_offset; - __u32 zero_size; - __u32 zero_offset_data; - __u32 zero_size_data; - __u32 cutting_raw_inode = 0; - - if (!(fm = jffs_cut_node(fmc, erase_size))) { - D3(printk("jffs_clear_end_of_node(): fm == NULL\n")); - return 0; - } - - /* Where and how much shall we clear? */ - zero_offset = fmc->head->offset + erase_size; - zero_size = fm->offset + fm->size - zero_offset; - - /* Do we have to clear the raw_inode explicitly? */ - if (fm->size - zero_size < sizeof(struct jffs_raw_inode)) { - cutting_raw_inode = sizeof(struct jffs_raw_inode) - - (fm->size - zero_size); - } - - /* First, clear the name and data fields. */ - zero_offset_data = zero_offset + cutting_raw_inode; - zero_size_data = zero_size - cutting_raw_inode; - flash_safe_acquire(fmc->mtd); - flash_memset(fmc->mtd, zero_offset_data, 0, zero_size_data); - flash_safe_release(fmc->mtd); - - /* Should we clear a part of the raw inode? */ - if (cutting_raw_inode) { - /* I guess it is ok to clear the raw inode in this order. */ - flash_safe_acquire(fmc->mtd); - flash_memset(fmc->mtd, zero_offset, 0, - cutting_raw_inode); - flash_safe_release(fmc->mtd); - } - - return 0; -} /* jffs_clear_end_of_node() */ - -/* Try to erase as much as possible of the dirt in the flash memory. */ -static long -jffs_try_to_erase(struct jffs_control *c) -{ - struct jffs_fmcontrol *fmc = c->fmc; - long erase_size; - int err; - __u32 offset; - - D3(printk("jffs_try_to_erase()\n")); - - erase_size = jffs_erasable_size(fmc); - - D2(printk("jffs_try_to_erase(): erase_size = %ld\n", erase_size)); - - if (erase_size == 0) { - return 0; - } - else if (erase_size < 0) { - printk(KERN_ERR "JFFS: jffs_try_to_erase: " - "jffs_erasable_size returned %ld.\n", erase_size); - return erase_size; - } - - if ((err = jffs_clear_end_of_node(c, erase_size)) < 0) { - printk(KERN_ERR "JFFS: jffs_try_to_erase: " - "Clearing of node failed.\n"); - return err; - } - - offset = fmc->head->offset; - - /* Now, let's try to do the erase. */ - if ((err = flash_erase_region(fmc->mtd, - offset, erase_size)) < 0) { - printk(KERN_ERR "JFFS: Erase of flash failed. " - "offset = %u, erase_size = %ld\n", - offset, erase_size); - /* XXX: Here we should allocate this area as dirty - with jffs_fmalloced or something similar. Now - we just report the error. */ - return err; - } - -#if 0 - /* Check if the erased sectors really got erased. */ - { - __u32 pos; - __u32 end; - - pos = (__u32)flash_get_direct_pointer(to_kdev_t(c->sb->s_dev), offset); - end = pos + erase_size; - - D2(printk("JFFS: Checking erased sector(s)...\n")); - - flash_safe_acquire(fmc->mtd); - - for (; pos < end; pos += 4) { - if (*(__u32 *)pos != JFFS_EMPTY_BITMASK) { - printk("JFFS: Erase failed! pos = 0x%lx\n", - (long)pos); - jffs_hexdump(fmc->mtd, pos, - jffs_min(256, end - pos)); - err = -1; - break; - } - } - - flash_safe_release(fmc->mtd); - - if (!err) { - D2(printk("JFFS: Erase succeeded.\n")); - } - else { - /* XXX: Here we should allocate the memory - with jffs_fmalloced() in order to prevent - JFFS from using this area accidentally. */ - return err; - } - } -#endif - - /* Update the flash memory data structures. */ - jffs_sync_erase(fmc, erase_size); - - return erase_size; -} - - -/* There are different criteria that should trigger a garbage collect: - - 1. There is too much dirt in the memory. - 2. The free space is becoming small. - 3. There are many versions of a node. - - The garbage collect should always be done in a manner that guarantees - that future garbage collects cannot be locked. E.g. Rewritten chunks - should not be too large (span more than one sector in the flash memory - for exemple). Of course there is a limit on how intelligent this garbage - collection can be. */ - - -static int -jffs_garbage_collect_now(struct jffs_control *c) -{ - struct jffs_fmcontrol *fmc = c->fmc; - long erased = 0; - int result = 0; - D1(int i = 1); - D2(printk("***jffs_garbage_collect_now(): fmc->dirty_size = %u, fmc->free_size = 0x%x\n, fcs1=0x%x, fcs2=0x%x", - fmc->dirty_size, fmc->free_size, jffs_free_size1(fmc), jffs_free_size2(fmc))); - D2(jffs_print_fmcontrol(fmc)); - - // down(&fmc->gclock); - - /* If it is possible to garbage collect, do so. */ - - while (erased == 0) { - D1(printk("***jffs_garbage_collect_now(): round #%u, " - "fmc->dirty_size = %u\n", i++, fmc->dirty_size)); - D2(jffs_print_fmcontrol(fmc)); - - if ((erased = jffs_try_to_erase(c)) < 0) { - printk(KERN_WARNING "JFFS: Error in " - "garbage collector.\n"); - result = erased; - goto gc_end; - } - if (erased) - break; - - if (fmc->free_size == 0) { - /* Argh */ - printk(KERN_ERR "jffs_garbage_collect_now(): free_size == 0. This is BAD.\n"); - result = -ENOSPC; - break; - } - - if (fmc->dirty_size < fmc->sector_size) { - /* Actually, we _may_ have been able to free some, - * if there are many overlapping nodes which aren't - * actually marked dirty because they still have - * some valid data in each. - */ - result = -ENOSPC; - break; - } - - /* Let's dare to make a garbage collect. */ - if ((result = jffs_garbage_collect_next(c)) < 0) { - printk(KERN_ERR "JFFS: Something " - "has gone seriously wrong " - "with a garbage collect.\n"); - goto gc_end; - } - - D1(printk(" jffs_garbage_collect_now(): erased: %ld\n", erased)); - DJM(jffs_print_memory_allocation_statistics()); - } - -gc_end: - // up(&fmc->gclock); - - D3(printk(" jffs_garbage_collect_now(): Leaving...\n")); - D1(if (erased) { - printk("jffs_g_c_now(): erased = %ld\n", erased); - jffs_print_fmcontrol(fmc); - }); - - if (!erased && !result) - return -ENOSPC; - - return result; -} /* jffs_garbage_collect_now() */ - - -/* Determine if it is reasonable to start garbage collection. - We start a gc pass if either: - - The number of free bytes < MIN_FREE_BYTES && at least one - block is dirty, OR - - The number of dirty bytes > MAX_DIRTY_BYTES -*/ -static inline int thread_should_wake (struct jffs_control *c) -{ - D1(printk (KERN_NOTICE "thread_should_wake(): free=%d, dirty=%d, blocksize=%d.\n", - c->fmc->free_size, c->fmc->dirty_size, c->fmc->sector_size)); - - /* If there's not enough dirty space to free a block, there's no point. */ - if (c->fmc->dirty_size < c->fmc->sector_size) { - D2(printk(KERN_NOTICE "thread_should_wake(): Not waking. Insufficient dirty space\n")); - return 0; - } -#if 1 - /* If there is too much RAM used by the various structures, GC */ - if (jffs_get_node_inuse() > (c->fmc->used_size/c->fmc->max_chunk_size * 5 + jffs_get_file_count() * 2 + 50)) { - /* FIXME: Provide proof that this test can be satisfied. We - don't want a filesystem doing endless GC just because this - condition cannot ever be false. - */ - D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to number of nodes\n")); - return 1; - } -#endif - /* If there are fewer free bytes than the threshold, GC */ - if (c->fmc->free_size < c->gc_minfree_threshold) { - D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to insufficent free space\n")); - return 1; - } - /* If there are more dirty bytes than the threshold, GC */ - if (c->fmc->dirty_size > c->gc_maxdirty_threshold) { - D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to excessive dirty space\n")); - return 1; - } - /* FIXME: What about the "There are many versions of a node" condition? */ - - return 0; -} - - -void jffs_garbage_collect_trigger(struct jffs_control *c) -{ - /* NOTE: We rely on the fact that we have the BKL here. - * Otherwise, the gc_task could go away between the check - * and the wake_up_process() - */ - if (c->gc_task && thread_should_wake(c)) - send_sig(SIGHUP, c->gc_task, 1); -} - - -/* Kernel threads take (void *) as arguments. Thus we pass - the jffs_control data as a (void *) and then cast it. */ -int -jffs_garbage_collect_thread(void *ptr) -{ - struct jffs_control *c = (struct jffs_control *) ptr; - struct jffs_fmcontrol *fmc = c->fmc; - long erased; - int result = 0; - D1(int i = 1); - - daemonize("jffs_gcd"); - - c->gc_task = current; - - lock_kernel(); - init_completion(&c->gc_thread_comp); /* barrier */ - spin_lock_irq(¤t->sighand->siglock); - siginitsetinv (¤t->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - D1(printk (KERN_NOTICE "jffs_garbage_collect_thread(): Starting infinite loop.\n")); - - for (;;) { - - /* See if we need to start gc. If we don't, go to sleep. - - Current implementation is a BAD THING(tm). If we try - to unmount the FS, the unmount operation will sleep waiting - for this thread to exit. We need to arrange to send it a - sig before the umount process sleeps. - */ - - if (!thread_should_wake(c)) - set_current_state (TASK_INTERRUPTIBLE); - - schedule(); /* Yes, we do this even if we want to go - on immediately - we're a low priority - background task. */ - - /* Put_super will send a SIGKILL and then wait on the sem. - */ - while (signal_pending(current)) { - siginfo_t info; - unsigned long signr = 0; - - if (try_to_freeze()) - continue; - - spin_lock_irq(¤t->sighand->siglock); - signr = dequeue_signal(current, ¤t->blocked, &info); - spin_unlock_irq(¤t->sighand->siglock); - - switch(signr) { - case SIGSTOP: - D1(printk("jffs_garbage_collect_thread(): SIGSTOP received.\n")); - set_current_state(TASK_STOPPED); - schedule(); - break; - - case SIGKILL: - D1(printk("jffs_garbage_collect_thread(): SIGKILL received.\n")); - c->gc_task = NULL; - complete_and_exit(&c->gc_thread_comp, 0); - } - } - - - D1(printk (KERN_NOTICE "jffs_garbage_collect_thread(): collecting.\n")); - - D3(printk (KERN_NOTICE "g_c_thread(): down biglock\n")); - mutex_lock(&fmc->biglock); - - D1(printk("***jffs_garbage_collect_thread(): round #%u, " - "fmc->dirty_size = %u\n", i++, fmc->dirty_size)); - D2(jffs_print_fmcontrol(fmc)); - - if ((erased = jffs_try_to_erase(c)) < 0) { - printk(KERN_WARNING "JFFS: Error in " - "garbage collector: %ld.\n", erased); - } - - if (erased) - goto gc_end; - - if (fmc->free_size == 0) { - /* Argh. Might as well commit suicide. */ - printk(KERN_ERR "jffs_garbage_collect_thread(): free_size == 0. This is BAD.\n"); - send_sig(SIGQUIT, c->gc_task, 1); - // panic() - goto gc_end; - } - - /* Let's dare to make a garbage collect. */ - if ((result = jffs_garbage_collect_next(c)) < 0) { - printk(KERN_ERR "JFFS: Something " - "has gone seriously wrong " - "with a garbage collect: %d\n", result); - } - - gc_end: - D3(printk (KERN_NOTICE "g_c_thread(): up biglock\n")); - mutex_unlock(&fmc->biglock); - } /* for (;;) */ -} /* jffs_garbage_collect_thread() */ diff --git a/fs/jffs/intrep.h b/fs/jffs/intrep.h deleted file mode 100644 index 5c7abe0e269..00000000000 --- a/fs/jffs/intrep.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 1999, 2000 Axis Communications AB. - * - * Created by Finn Hakansson <finn@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: intrep.h,v 1.14 2001/09/23 23:28:37 dwmw2 Exp $ - * - */ - -#ifndef __LINUX_JFFS_INTREP_H__ -#define __LINUX_JFFS_INTREP_H__ -#include "jffs_fm.h" -struct jffs_node *jffs_alloc_node(void); -void jffs_free_node(struct jffs_node *n); -int jffs_get_node_inuse(void); - -void jffs_cleanup_control(struct jffs_control *c); -int jffs_build_fs(struct super_block *sb); - -int jffs_insert_node(struct jffs_control *c, struct jffs_file *f, - const struct jffs_raw_inode *raw_inode, - const char *name, struct jffs_node *node); -struct jffs_file *jffs_find_file(struct jffs_control *c, __u32 ino); -struct jffs_file *jffs_find_child(struct jffs_file *dir, const char *name, int len); - -void jffs_free_node(struct jffs_node *node); - -int jffs_foreach_file(struct jffs_control *c, int (*func)(struct jffs_file *)); -int jffs_possibly_delete_file(struct jffs_file *f); -int jffs_insert_file_into_tree(struct jffs_file *f); -int jffs_unlink_file_from_tree(struct jffs_file *f); -int jffs_file_count(struct jffs_file *f); - -int jffs_write_node(struct jffs_control *c, struct jffs_node *node, - struct jffs_raw_inode *raw_inode, - const char *name, const unsigned char *buf, - int recoverable, struct jffs_file *f); -int jffs_read_data(struct jffs_file *f, unsigned char *buf, __u32 read_offset, __u32 size); - -/* Garbage collection stuff. */ -int jffs_garbage_collect_thread(void *c); -void jffs_garbage_collect_trigger(struct jffs_control *c); - -/* For debugging purposes. */ -#if 0 -int jffs_print_file(struct jffs_file *f); -#endif /* 0 */ -void jffs_print_hash_table(struct jffs_control *c); -void jffs_print_tree(struct jffs_file *first_file, int indent); - -#endif /* __LINUX_JFFS_INTREP_H__ */ diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c deleted file mode 100644 index 5a95fbdd6fd..00000000000 --- a/fs/jffs/jffs_fm.c +++ /dev/null @@ -1,798 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 1999, 2000 Axis Communications AB. - * - * Created by Finn Hakansson <finn@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: jffs_fm.c,v 1.27 2001/09/20 12:29:47 dwmw2 Exp $ - * - * Ported to Linux 2.3.x and MTD: - * Copyright (C) 2000 Alexander Larsson (alex@cendio.se), Cendio Systems AB - * - */ -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/blkdev.h> -#include <linux/jffs.h> -#include "jffs_fm.h" -#include "intrep.h" - -#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE -static int jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset); -#endif - -static struct jffs_fm *jffs_alloc_fm(void); -static void jffs_free_fm(struct jffs_fm *n); - -extern struct kmem_cache *fm_cache; -extern struct kmem_cache *node_cache; - -#if CONFIG_JFFS_FS_VERBOSE > 0 -void -jffs_print_fmcontrol(struct jffs_fmcontrol *fmc) -{ - D(printk("struct jffs_fmcontrol: 0x%p\n", fmc)); - D(printk("{\n")); - D(printk(" %u, /* flash_size */\n", fmc->flash_size)); - D(printk(" %u, /* used_size */\n", fmc->used_size)); - D(printk(" %u, /* dirty_size */\n", fmc->dirty_size)); - D(printk(" %u, /* free_size */\n", fmc->free_size)); - D(printk(" %u, /* sector_size */\n", fmc->sector_size)); - D(printk(" %u, /* min_free_size */\n", fmc->min_free_size)); - D(printk(" %u, /* max_chunk_size */\n", fmc->max_chunk_size)); - D(printk(" 0x%p, /* mtd */\n", fmc->mtd)); - D(printk(" 0x%p, /* head */ " - "(head->offset = 0x%08x)\n", - fmc->head, (fmc->head ? fmc->head->offset : 0))); - D(printk(" 0x%p, /* tail */ " - "(tail->offset + tail->size = 0x%08x)\n", - fmc->tail, - (fmc->tail ? fmc->tail->offset + fmc->tail->size : 0))); - D(printk(" 0x%p, /* head_extra */\n", fmc->head_extra)); - D(printk(" 0x%p, /* tail_extra */\n", fmc->tail_extra)); - D(printk("}\n")); -} -#endif /* CONFIG_JFFS_FS_VERBOSE > 0 */ - -#if CONFIG_JFFS_FS_VERBOSE > 2 -static void -jffs_print_fm(struct jffs_fm *fm) -{ - D(printk("struct jffs_fm: 0x%p\n", fm)); - D(printk("{\n")); - D(printk(" 0x%08x, /* offset */\n", fm->offset)); - D(printk(" %u, /* size */\n", fm->size)); - D(printk(" 0x%p, /* prev */\n", fm->prev)); - D(printk(" 0x%p, /* next */\n", fm->next)); - D(printk(" 0x%p, /* nodes */\n", fm->nodes)); - D(printk("}\n")); -} -#endif /* CONFIG_JFFS_FS_VERBOSE > 2 */ - -#if 0 -void -jffs_print_node_ref(struct jffs_node_ref *ref) -{ - D(printk("struct jffs_node_ref: 0x%p\n", ref)); - D(printk("{\n")); - D(printk(" 0x%p, /* node */\n", ref->node)); - D(printk(" 0x%p, /* next */\n", ref->next)); - D(printk("}\n")); -} -#endif /* 0 */ - -/* This function creates a new shiny flash memory control structure. */ -struct jffs_fmcontrol * -jffs_build_begin(struct jffs_control *c, int unit) -{ - struct jffs_fmcontrol *fmc; - struct mtd_info *mtd; - - D3(printk("jffs_build_begin()\n")); - fmc = kmalloc(sizeof(*fmc), GFP_KERNEL); - if (!fmc) { - D(printk("jffs_build_begin(): Allocation of " - "struct jffs_fmcontrol failed!\n")); - return (struct jffs_fmcontrol *)0; - } - DJM(no_jffs_fmcontrol++); - - mtd = get_mtd_device(NULL, unit); - - if (IS_ERR(mtd)) { - kfree(fmc); - DJM(no_jffs_fmcontrol--); - return NULL; - } - - /* Retrieve the size of the flash memory. */ - fmc->flash_size = mtd->size; - D3(printk(" fmc->flash_size = %d bytes\n", fmc->flash_size)); - - fmc->used_size = 0; - fmc->dirty_size = 0; - fmc->free_size = mtd->size; - fmc->sector_size = mtd->erasesize; - fmc->max_chunk_size = fmc->sector_size >> 1; - /* min_free_size: - 1 sector, obviously. - + 1 x max_chunk_size, for when a nodes overlaps the end of a sector - + 1 x max_chunk_size again, which ought to be enough to handle - the case where a rename causes a name to grow, and GC has - to write out larger nodes than the ones it's obsoleting. - We should fix it so it doesn't have to write the name - _every_ time. Later. - + another 2 sectors because people keep getting GC stuck and - we don't know why. This scares me - I want formal proof - of correctness of whatever number we put here. dwmw2. - */ - fmc->min_free_size = fmc->sector_size << 2; - fmc->mtd = mtd; - fmc->c = c; - fmc->head = NULL; - fmc->tail = NULL; - fmc->head_extra = NULL; - fmc->tail_extra = NULL; - mutex_init(&fmc->biglock); - return fmc; -} - - -/* When the flash memory scan has completed, this function should be called - before use of the control structure. */ -void -jffs_build_end(struct jffs_fmcontrol *fmc) -{ - D3(printk("jffs_build_end()\n")); - - if (!fmc->head) { - fmc->head = fmc->head_extra; - fmc->tail = fmc->tail_extra; - } - else if (fmc->head_extra) { - fmc->tail_extra->next = fmc->head; - fmc->head->prev = fmc->tail_extra; - fmc->head = fmc->head_extra; - } - fmc->head_extra = NULL; /* These two instructions should be omitted. */ - fmc->tail_extra = NULL; - D3(jffs_print_fmcontrol(fmc)); -} - - -/* Call this function when the file system is unmounted. This function - frees all memory used by this module. */ -void -jffs_cleanup_fmcontrol(struct jffs_fmcontrol *fmc) -{ - if (fmc) { - struct jffs_fm *next = fmc->head; - while (next) { - struct jffs_fm *cur = next; - next = next->next; - jffs_free_fm(cur); - } - put_mtd_device(fmc->mtd); - kfree(fmc); - DJM(no_jffs_fmcontrol--); - } -} - - -/* This function returns the size of the first chunk of free space on the - flash memory. This function will return something nonzero if the flash - memory contains any free space. */ -__u32 -jffs_free_size1(struct jffs_fmcontrol *fmc) -{ - __u32 head; - __u32 tail; - __u32 end = fmc->flash_size; - - if (!fmc->head) { - /* There is nothing on the flash. */ - return fmc->flash_size; - } - - /* Compute the beginning and ending of the contents of the flash. */ - head = fmc->head->offset; - tail = fmc->tail->offset + fmc->tail->size; - if (tail == end) { - tail = 0; - } - ASSERT(else if (tail > end) { - printk(KERN_WARNING "jffs_free_size1(): tail > end\n"); - tail = 0; - }); - - if (head <= tail) { - return end - tail; - } - else { - return head - tail; - } -} - -/* This function will return something nonzero in case there are two free - areas on the flash. Like this: - - +----------------+------------------+----------------+ - | FREE 1 | USED / DIRTY | FREE 2 | - +----------------+------------------+----------------+ - fmc->head -----^ - fmc->tail ------------------------^ - - The value returned, will be the size of the first empty area on the - flash, in this case marked "FREE 1". */ -__u32 -jffs_free_size2(struct jffs_fmcontrol *fmc) -{ - if (fmc->head) { - __u32 head = fmc->head->offset; - __u32 tail = fmc->tail->offset + fmc->tail->size; - if (tail == fmc->flash_size) { - tail = 0; - } - - if (tail >= head) { - return head; - } - } - return 0; -} - - -/* Allocate a chunk of flash memory. If there is enough space on the - device, a reference to the associated node is stored in the jffs_fm - struct. */ -int -jffs_fmalloc(struct jffs_fmcontrol *fmc, __u32 size, struct jffs_node *node, - struct jffs_fm **result) -{ - struct jffs_fm *fm; - __u32 free_chunk_size1; - __u32 free_chunk_size2; - - D2(printk("jffs_fmalloc(): fmc = 0x%p, size = %d, " - "node = 0x%p\n", fmc, size, node)); - - *result = NULL; - - if (!(fm = jffs_alloc_fm())) { - D(printk("jffs_fmalloc(): kmalloc() failed! (fm)\n")); - return -ENOMEM; - } - - free_chunk_size1 = jffs_free_size1(fmc); - free_chunk_size2 = jffs_free_size2(fmc); - if (free_chunk_size1 + free_chunk_size2 != fmc->free_size) { - printk(KERN_WARNING "Free size accounting screwed\n"); - printk(KERN_WARNING "free_chunk_size1 == 0x%x, free_chunk_size2 == 0x%x, fmc->free_size == 0x%x\n", free_chunk_size1, free_chunk_size2, fmc->free_size); - } - - D3(printk("jffs_fmalloc(): free_chunk_size1 = %u, " - "free_chunk_size2 = %u\n", - free_chunk_size1, free_chunk_size2)); - - if (size <= free_chunk_size1) { - if (!(fm->nodes = (struct jffs_node_ref *) - kmalloc(sizeof(struct jffs_node_ref), - GFP_KERNEL))) { - D(printk("jffs_fmalloc(): kmalloc() failed! " - "(node_ref)\n")); - jffs_free_fm(fm); - return -ENOMEM; - } - DJM(no_jffs_node_ref++); - fm->nodes->node = node; - fm->nodes->next = NULL; - if (fmc->tail) { - fm->offset = fmc->tail->offset + fmc->tail->size; - if (fm->offset == fmc->flash_size) { - fm->offset = 0; - } - ASSERT(else if (fm->offset > fmc->flash_size) { - printk(KERN_WARNING "jffs_fmalloc(): " - "offset > flash_end\n"); - fm->offset = 0; - }); - } - else { - /* There don't have to be files in the file - system yet. */ - fm->offset = 0; - } - fm->size = size; - fmc->free_size -= size; - fmc->used_size += size; - } - else if (size > free_chunk_size2) { - printk(KERN_WARNING "JFFS: Tried to allocate a too " - "large flash memory chunk. (size = %u)\n", size); - jffs_free_fm(fm); - return -ENOSPC; - } - else { - fm->offset = fmc->tail->offset + fmc->tail->size; - fm->size = free_chunk_size1; - fm->nodes = NULL; - fmc->free_size -= fm->size; - fmc->dirty_size += fm->size; /* Changed by simonk. This seemingly fixes a - bug that caused infinite garbage collection. - It previously set fmc->dirty_size to size (which is the - size of the requested chunk). - */ - } - - fm->next = NULL; - if (!fmc->head) { - fm->prev = NULL; - fmc->head = fm; - fmc->tail = fm; - } - else { - fm->prev = fmc->tail; - fmc->tail->next = fm; - fmc->tail = fm; - } - - D3(jffs_print_fmcontrol(fmc)); - D3(jffs_print_fm(fm)); - *result = fm; - return 0; -} - - -/* The on-flash space is not needed anymore by the passed node. Remove - the reference to the node from the node list. If the data chunk in - the flash memory isn't used by any more nodes anymore (fm->nodes == 0), - then mark that chunk as dirty. */ -int -jffs_fmfree(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, struct jffs_node *node) -{ - struct jffs_node_ref *ref; - struct jffs_node_ref *prev; - ASSERT(int del = 0); - - D2(printk("jffs_fmfree(): node->ino = %u, node->version = %u\n", - node->ino, node->version)); - - ASSERT(if (!fmc || !fm || !fm->nodes) { - printk(KERN_ERR "jffs_fmfree(): fmc: 0x%p, fm: 0x%p, " - "fm->nodes: 0x%p\n", - fmc, fm, (fm ? fm->nodes : NULL)); - return -1; - }); - - /* Find the reference to the node that is going to be removed - and remove it. */ - for (ref = fm->nodes, prev = NULL; ref; ref = ref->next) { - if (ref->node == node) { - if (prev) { - prev->next = ref->next; - } - else { - fm->nodes = ref->next; - } - kfree(ref); - DJM(no_jffs_node_ref--); - ASSERT(del = 1); - break; - } - prev = ref; - } - - /* If the data chunk in the flash memory isn't used anymore - just mark it as obsolete. */ - if (!fm->nodes) { - /* No node uses this chunk so let's remove it. */ - fmc->used_size -= fm->size; - fmc->dirty_size += fm->size; -#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE - if (jffs_mark_obsolete(fmc, fm->offset) < 0) { - D1(printk("jffs_fmfree(): Failed to mark an on-flash " - "node obsolete!\n")); - return -1; - } -#endif - } - - ASSERT(if (!del) { - printk(KERN_WARNING "***jffs_fmfree(): " - "Didn't delete any node reference!\n"); - }); - - return 0; -} - - -/* This allocation function is used during the initialization of - the file system. */ -struct jffs_fm * -jffs_fmalloced(struct jffs_fmcontrol *fmc, __u32 offset, __u32 size, - struct jffs_node *node) -{ - struct jffs_fm *fm; - - D3(printk("jffs_fmalloced()\n")); - - if (!(fm = jffs_alloc_fm())) { - D(printk("jffs_fmalloced(0x%p, %u, %u, 0x%p): failed!\n", - fmc, offset, size, node)); - return NULL; - } - fm->offset = offset; - fm->size = size; - fm->prev = NULL; - fm->next = NULL; - fm->nodes = NULL; - if (node) { - /* `node' exists and it should be associated with the - jffs_fm structure `fm'. */ - if (!(fm->nodes = (struct jffs_node_ref *) - kmalloc(sizeof(struct jffs_node_ref), - GFP_KERNEL))) { - D(printk("jffs_fmalloced(): !fm->nodes\n")); - jffs_free_fm(fm); - return NULL; - } - DJM(no_jffs_node_ref++); - fm->nodes->node = node; - fm->nodes->next = NULL; - fmc->used_size += size; - fmc->free_size -= size; - } - else { - /* If there is no node, then this is just a chunk of dirt. */ - fmc->dirty_size += size; - fmc->free_size -= size; - } - - if (fmc->head_extra) { - fm->prev = fmc->tail_extra; - fmc->tail_extra->next = fm; - fmc->tail_extra = fm; - } - else if (!fmc->head) { - fmc->head = fm; - fmc->tail = fm; - } - else if (fmc->tail->offset + fmc->tail->size < offset) { - fmc->head_extra = fm; - fmc->tail_extra = fm; - } - else { - fm->prev = fmc->tail; - fmc->tail->next = fm; - fmc->tail = fm; - } - D3(jffs_print_fmcontrol(fmc)); - D3(jffs_print_fm(fm)); - return fm; -} - - -/* Add a new node to an already existing jffs_fm struct. */ -int -jffs_add_node(struct jffs_node *node) -{ - struct jffs_node_ref *ref; - - D3(printk("jffs_add_node(): ino = %u\n", node->ino)); - - ref = kmalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) - return -ENOMEM; - - DJM(no_jffs_node_ref++); - ref->node = node; - ref->next = node->fm->nodes; - node->fm->nodes = ref; - return 0; -} - - -/* Free a part of some allocated space. */ -void -jffs_fmfree_partly(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, __u32 size) -{ - D1(printk("***jffs_fmfree_partly(): fm = 0x%p, fm->nodes = 0x%p, " - "fm->nodes->node->ino = %u, size = %u\n", - fm, (fm ? fm->nodes : 0), - (!fm ? 0 : (!fm->nodes ? 0 : fm->nodes->node->ino)), size)); - - if (fm->nodes) { - kfree(fm->nodes); - DJM(no_jffs_node_ref--); - fm->nodes = NULL; - } - fmc->used_size -= fm->size; - if (fm == fmc->tail) { - fm->size -= size; - fmc->free_size += size; - } - fmc->dirty_size += fm->size; -} - - -/* Find the jffs_fm struct that contains the end of the data chunk that - begins at the logical beginning of the flash memory and spans `size' - bytes. If we want to erase a sector of the flash memory, we use this - function to find where the sector limit cuts a chunk of data. */ -struct jffs_fm * -jffs_cut_node(struct jffs_fmcontrol *fmc, __u32 size) -{ - struct jffs_fm *fm; - __u32 pos = 0; - - if (size == 0) { - return NULL; - } - - ASSERT(if (!fmc) { - printk(KERN_ERR "jffs_cut_node(): fmc == NULL\n"); - return NULL; - }); - - fm = fmc->head; - - while (fm) { - pos += fm->size; - if (pos < size) { - fm = fm->next; - } - else if (pos > size) { - break; - } - else { - fm = NULL; - break; - } - } - - return fm; -} - - -/* Move the head of the fmc structures and delete the obsolete parts. */ -void -jffs_sync_erase(struct jffs_fmcontrol *fmc, int erased_size) -{ - struct jffs_fm *fm; - struct jffs_fm *del; - - ASSERT(if (!fmc) { - printk(KERN_ERR "jffs_sync_erase(): fmc == NULL\n"); - return; - }); - - fmc->dirty_size -= erased_size; - fmc->free_size += erased_size; - - for (fm = fmc->head; fm && (erased_size > 0);) { - if (erased_size >= fm->size) { - erased_size -= fm->size; - del = fm; - fm = fm->next; - fm->prev = NULL; - fmc->head = fm; - jffs_free_fm(del); - } - else { - fm->size -= erased_size; - fm->offset += erased_size; - break; - } - } -} - - -/* Return the oldest used node in the flash memory. */ -struct jffs_node * -jffs_get_oldest_node(struct jffs_fmcontrol *fmc) -{ - struct jffs_fm *fm; - struct jffs_node_ref *nref; - struct jffs_node *node = NULL; - - ASSERT(if (!fmc) { - printk(KERN_ERR "jffs_get_oldest_node(): fmc == NULL\n"); - return NULL; - }); - - for (fm = fmc->head; fm && !fm->nodes; fm = fm->next); - - if (!fm) { - return NULL; - } - - /* The oldest node is the last one in the reference list. This list - shouldn't be too long; just one or perhaps two elements. */ - for (nref = fm->nodes; nref; nref = nref->next) { - node = nref->node; - } - - D2(printk("jffs_get_oldest_node(): ino = %u, version = %u\n", - (node ? node->ino : 0), (node ? node->version : 0))); - - return node; -} - - -#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE - -/* Mark an on-flash node as obsolete. - - Note that this is just an optimization that isn't necessary for the - filesystem to work. */ - -static int -jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset) -{ - /* The `accurate_pos' holds the position of the accurate byte - in the jffs_raw_inode structure that we are going to mark - as obsolete. */ - __u32 accurate_pos = fm_offset + JFFS_RAW_INODE_ACCURATE_OFFSET; - unsigned char zero = 0x00; - size_t len; - - D3(printk("jffs_mark_obsolete(): accurate_pos = %u\n", accurate_pos)); - ASSERT(if (!fmc) { - printk(KERN_ERR "jffs_mark_obsolete(): fmc == NULL\n"); - return -1; - }); - - /* Write 0x00 to the raw inode's accurate member. Don't care - about the return value. */ - MTD_WRITE(fmc->mtd, accurate_pos, 1, &len, &zero); - return 0; -} - -#endif /* JFFS_MARK_OBSOLETE */ - -/* check if it's possible to erase the wanted range, and if not, return - * the range that IS erasable, or a negative error code. - */ -static long -jffs_flash_erasable_size(struct mtd_info *mtd, __u32 offset, __u32 size) -{ - u_long ssize; - - /* assume that sector size for a partition is constant even - * if it spans more than one chip (you usually put the same - * type of chips in a system) - */ - - ssize = mtd->erasesize; - - if (offset % ssize) { - printk(KERN_WARNING "jffs_flash_erasable_size() given non-aligned offset %x (erasesize %lx)\n", offset, ssize); - /* The offset is not sector size aligned. */ - return -1; - } - else if (offset > mtd->size) { - printk(KERN_WARNING "jffs_flash_erasable_size given offset off the end of device (%x > %x)\n", offset, mtd->size); - return -2; - } - else if (offset + size > mtd->size) { - printk(KERN_WARNING "jffs_flash_erasable_size() given length which runs off the end of device (ofs %x + len %x = %x, > %x)\n", offset,size, offset+size, mtd->size); - return -3; - } - - return (size / ssize) * ssize; -} - - -/* How much dirty flash memory is possible to erase at the moment? */ -long -jffs_erasable_size(struct jffs_fmcontrol *fmc) -{ - struct jffs_fm *fm; - __u32 size = 0; - long ret; - - ASSERT(if (!fmc) { - printk(KERN_ERR "jffs_erasable_size(): fmc = NULL\n"); - return -1; - }); - - if (!fmc->head) { - /* The flash memory is totally empty. No nodes. No dirt. - Just return. */ - return 0; - } - - /* Calculate how much space that is dirty. */ - for (fm = fmc->head; fm && !fm->nodes; fm = fm->next) { - if (size && fm->offset == 0) { - /* We have reached the beginning of the flash. */ - break; - } - size += fm->size; - } - - /* Someone's signature contained this: - There's a fine line between fishing and just standing on - the shore like an idiot... */ - ret = jffs_flash_erasable_size(fmc->mtd, fmc->head->offset, size); - - ASSERT(if (ret < 0) { - printk("jffs_erasable_size: flash_erasable_size() " - "returned something less than zero (%ld).\n", ret); - printk("jffs_erasable_size: offset = 0x%08x\n", - fmc->head->offset); - }); - - /* If there is dirt on the flash (which is the reason to why - this function was called in the first place) but no space is - possible to erase right now, the initial part of the list of - jffs_fm structs, that hold place for dirty space, could perhaps - be shortened. The list's initial "dirty" elements are merged - into just one large dirty jffs_fm struct. This operation must - only be performed if nothing is possible to erase. Otherwise, - jffs_clear_end_of_node() won't work as expected. */ - if (ret == 0) { - struct jffs_fm *head = fmc->head; - struct jffs_fm *del; - /* While there are two dirty nodes beside each other.*/ - while (head->nodes == 0 - && head->next - && head->next->nodes == 0) { - del = head->next; - head->size += del->size; - head->next = del->next; - if (del->next) { - del->next->prev = head; - } - jffs_free_fm(del); - } - } - - return (ret >= 0 ? ret : 0); -} - -static struct jffs_fm *jffs_alloc_fm(void) -{ - struct jffs_fm *fm; - - fm = kmem_cache_alloc(fm_cache,GFP_KERNEL); - DJM(if (fm) no_jffs_fm++;); - - return fm; -} - -static void jffs_free_fm(struct jffs_fm *n) -{ - kmem_cache_free(fm_cache,n); - DJM(no_jffs_fm--); -} - - - -struct jffs_node *jffs_alloc_node(void) -{ - struct jffs_node *n; - - n = (struct jffs_node *)kmem_cache_alloc(node_cache,GFP_KERNEL); - if(n != NULL) - no_jffs_node++; - return n; -} - -void jffs_free_node(struct jffs_node *n) -{ - kmem_cache_free(node_cache,n); - no_jffs_node--; -} - - -int jffs_get_node_inuse(void) -{ - return no_jffs_node; -} diff --git a/fs/jffs/jffs_fm.h b/fs/jffs/jffs_fm.h deleted file mode 100644 index 9ee6ad29eff..00000000000 --- a/fs/jffs/jffs_fm.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 1999, 2000 Axis Communications AB. - * - * Created by Finn Hakansson <finn@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: jffs_fm.h,v 1.13 2001/01/11 12:03:25 dwmw2 Exp $ - * - * Ported to Linux 2.3.x and MTD: - * Copyright (C) 2000 Alexander Larsson (alex@cendio.se), Cendio Systems AB - * - */ - -#ifndef __LINUX_JFFS_FM_H__ -#define __LINUX_JFFS_FM_H__ - -#include <linux/types.h> -#include <linux/jffs.h> -#include <linux/mtd/mtd.h> -#include <linux/mutex.h> - -/* The alignment between two nodes in the flash memory. */ -#define JFFS_ALIGN_SIZE 4 - -/* Mark the on-flash space as obsolete when appropriate. */ -#define JFFS_MARK_OBSOLETE 0 - -#ifndef CONFIG_JFFS_FS_VERBOSE -#define CONFIG_JFFS_FS_VERBOSE 1 -#endif - -#if CONFIG_JFFS_FS_VERBOSE > 0 -#define D(x) x -#define D1(x) D(x) -#else -#define D(x) -#define D1(x) -#endif - -#if CONFIG_JFFS_FS_VERBOSE > 1 -#define D2(x) D(x) -#else -#define D2(x) -#endif - -#if CONFIG_JFFS_FS_VERBOSE > 2 -#define D3(x) D(x) -#else -#define D3(x) -#endif - -#define ASSERT(x) x - -/* How many padding bytes should be inserted between two chunks of data - on the flash? */ -#define JFFS_GET_PAD_BYTES(size) ( (JFFS_ALIGN_SIZE-1) & -(__u32)(size) ) -#define JFFS_PAD(size) ( (size + (JFFS_ALIGN_SIZE-1)) & ~(JFFS_ALIGN_SIZE-1) ) - - - -struct jffs_node_ref -{ - struct jffs_node *node; - struct jffs_node_ref *next; -}; - - -/* The struct jffs_fm represents a chunk of data in the flash memory. */ -struct jffs_fm -{ - __u32 offset; - __u32 size; - struct jffs_fm *prev; - struct jffs_fm *next; - struct jffs_node_ref *nodes; /* USED if != 0. */ -}; - -struct jffs_fmcontrol -{ - __u32 flash_size; - __u32 used_size; - __u32 dirty_size; - __u32 free_size; - __u32 sector_size; - __u32 min_free_size; /* The minimum free space needed to be able - to perform garbage collections. */ - __u32 max_chunk_size; /* The maximum size of a chunk of data. */ - struct mtd_info *mtd; - struct jffs_control *c; - struct jffs_fm *head; - struct jffs_fm *tail; - struct jffs_fm *head_extra; - struct jffs_fm *tail_extra; - struct mutex biglock; -}; - -/* Notice the two members head_extra and tail_extra in the jffs_control - structure above. Those are only used during the scanning of the flash - memory; while the file system is being built. If the data in the flash - memory is organized like - - +----------------+------------------+----------------+ - | USED / DIRTY | FREE | USED / DIRTY | - +----------------+------------------+----------------+ - - then the scan is split in two parts. The first scanned part of the - flash memory is organized through the members head and tail. The - second scanned part is organized with head_extra and tail_extra. When - the scan is completed, the two lists are merged together. The jffs_fm - struct that head_extra references is the logical beginning of the - flash memory so it will be referenced by the head member. */ - - - -struct jffs_fmcontrol *jffs_build_begin(struct jffs_control *c, int unit); -void jffs_build_end(struct jffs_fmcontrol *fmc); -void jffs_cleanup_fmcontrol(struct jffs_fmcontrol *fmc); - -int jffs_fmalloc(struct jffs_fmcontrol *fmc, __u32 size, - struct jffs_node *node, struct jffs_fm **result); -int jffs_fmfree(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, - struct jffs_node *node); - -__u32 jffs_free_size1(struct jffs_fmcontrol *fmc); -__u32 jffs_free_size2(struct jffs_fmcontrol *fmc); -void jffs_sync_erase(struct jffs_fmcontrol *fmc, int erased_size); -struct jffs_fm *jffs_cut_node(struct jffs_fmcontrol *fmc, __u32 size); -struct jffs_node *jffs_get_oldest_node(struct jffs_fmcontrol *fmc); -long jffs_erasable_size(struct jffs_fmcontrol *fmc); -struct jffs_fm *jffs_fmalloced(struct jffs_fmcontrol *fmc, __u32 offset, - __u32 size, struct jffs_node *node); -int jffs_add_node(struct jffs_node *node); -void jffs_fmfree_partly(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, - __u32 size); - -#if CONFIG_JFFS_FS_VERBOSE > 0 -void jffs_print_fmcontrol(struct jffs_fmcontrol *fmc); -#endif -#if 0 -void jffs_print_node_ref(struct jffs_node_ref *ref); -#endif /* 0 */ - -#endif /* __LINUX_JFFS_FM_H__ */ diff --git a/fs/jffs/jffs_proc.c b/fs/jffs/jffs_proc.c deleted file mode 100644 index 9bdd99a557c..00000000000 --- a/fs/jffs/jffs_proc.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 2000 Axis Communications AB. - * - * Created by Simon Kagstrom <simonk@axis.com>. - * - * $Id: jffs_proc.c,v 1.5 2001/06/02 14:34:55 dwmw2 Exp $ - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * Overview: - * This file defines JFFS partition entries in the proc file system. - * - * TODO: - * Create some more proc files for different kinds of info, i.e. statistics - * about written and read bytes, number of calls to different routines, - * reports about failures. - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/jffs.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/time.h> -#include <linux/types.h> -#include "jffs_fm.h" -#include "jffs_proc.h" - -/* - * Structure for a JFFS partition in the system - */ -struct jffs_partition_dir { - struct jffs_control *c; - struct proc_dir_entry *part_root; - struct proc_dir_entry *part_info; - struct proc_dir_entry *part_layout; - struct jffs_partition_dir *next; -}; - -/* - * Structure for top-level entry in '/proc/fs' directory - */ -struct proc_dir_entry *jffs_proc_root; - -/* - * Linked list of 'jffs_partition_dirs' to help us track - * the mounted JFFS partitions in the system - */ -static struct jffs_partition_dir *jffs_part_dirs; - -/* - * Read functions for entries - */ -static int jffs_proc_info_read(char *page, char **start, off_t off, - int count, int *eof, void *data); -static int jffs_proc_layout_read (char *page, char **start, off_t off, - int count, int *eof, void *data); - - -/* - * Register a JFFS partition directory (called upon mount) - */ -int jffs_register_jffs_proc_dir(int mtd, struct jffs_control *c) -{ - struct jffs_partition_dir *part_dir; - struct proc_dir_entry *part_info = NULL; - struct proc_dir_entry *part_layout = NULL; - struct proc_dir_entry *part_root = NULL; - char name[10]; - - sprintf(name, "%d", mtd); - /* Allocate structure for local JFFS partition table */ - part_dir = (struct jffs_partition_dir *) - kmalloc(sizeof (struct jffs_partition_dir), GFP_KERNEL); - if (!part_dir) - goto out; - - /* Create entry for this partition */ - part_root = proc_mkdir(name, jffs_proc_root); - if (!part_root) - goto out1; - - /* Create entry for 'info' file */ - part_info = create_proc_entry ("info", 0, part_root); - if (!part_info) - goto out2; - part_info->read_proc = jffs_proc_info_read; - part_info->data = (void *) c; - - /* Create entry for 'layout' file */ - part_layout = create_proc_entry ("layout", 0, part_root); - if (!part_layout) - goto out3; - part_layout->read_proc = jffs_proc_layout_read; - part_layout->data = (void *) c; - - /* Fill in structure for table and insert in the list */ - part_dir->c = c; - part_dir->part_root = part_root; - part_dir->part_info = part_info; - part_dir->part_layout = part_layout; - part_dir->next = jffs_part_dirs; - jffs_part_dirs = part_dir; - - /* Return happy */ - return 0; - -out3: - remove_proc_entry("info", part_root); -out2: - remove_proc_entry(name, jffs_proc_root); -out1: - kfree(part_dir); -out: - return -ENOMEM; -} - - -/* - * Unregister a JFFS partition directory (called at umount) - */ -int jffs_unregister_jffs_proc_dir(struct jffs_control *c) -{ - struct jffs_partition_dir *part_dir = jffs_part_dirs; - struct jffs_partition_dir *prev_part_dir = NULL; - - while (part_dir) { - if (part_dir->c == c) { - /* Remove entries for partition */ - remove_proc_entry (part_dir->part_info->name, - part_dir->part_root); - remove_proc_entry (part_dir->part_layout->name, - part_dir->part_root); - remove_proc_entry (part_dir->part_root->name, - jffs_proc_root); - - /* Remove entry from list */ - if (prev_part_dir) - prev_part_dir->next = part_dir->next; - else - jffs_part_dirs = part_dir->next; - - /* - * Check to see if this is the last one - * and remove the entry from '/proc/fs' - * if it is. - */ - if (jffs_part_dirs == part_dir->next) - remove_proc_entry ("jffs", proc_root_fs); - - /* Free memory for entry */ - kfree(part_dir); - - /* Return happy */ - return 0; - } - - /* Move to next entry */ - prev_part_dir = part_dir; - part_dir = part_dir->next; - } - - /* Return unhappy */ - return -1; -} - - -/* - * Read a JFFS partition's `info' file - */ -static int jffs_proc_info_read (char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct jffs_control *c = (struct jffs_control *) data; - int len = 0; - - /* Get information on the parition */ - len += sprintf (page, - "partition size: %08lX (%u)\n" - "sector size: %08lX (%u)\n" - "used size: %08lX (%u)\n" - "dirty size: %08lX (%u)\n" - "free size: %08lX (%u)\n\n", - (unsigned long) c->fmc->flash_size, c->fmc->flash_size, - (unsigned long) c->fmc->sector_size, c->fmc->sector_size, - (unsigned long) c->fmc->used_size, c->fmc->used_size, - (unsigned long) c->fmc->dirty_size, c->fmc->dirty_size, - (unsigned long) (c->fmc->flash_size - - (c->fmc->used_size + c->fmc->dirty_size)), - c->fmc->flash_size - (c->fmc->used_size + c->fmc->dirty_size)); - - /* We're done */ - *eof = 1; - - /* Return length */ - return len; -} - - -/* - * Read a JFFS partition's `layout' file - */ -static int jffs_proc_layout_read (char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct jffs_control *c = (struct jffs_control *) data; - struct jffs_fm *fm = NULL; - struct jffs_fm *last_fm = NULL; - int len = 0; - - /* Get the first item in the list */ - fm = c->fmc->head; - - /* Print free space */ - if (fm && fm->offset) { - len += sprintf (page, "00000000 %08lX free\n", - (unsigned long) fm->offset); - } - - /* Loop through all of the flash control structures */ - while (fm && (len < (off + count))) { - if (fm->nodes) { - len += sprintf (page + len, - "%08lX %08lX ino=%08lX, ver=%08lX\n", - (unsigned long) fm->offset, - (unsigned long) fm->size, - (unsigned long) fm->nodes->node->ino, - (unsigned long) fm->nodes->node->version); - } - else { - len += sprintf (page + len, - "%08lX %08lX dirty\n", - (unsigned long) fm->offset, - (unsigned long) fm->size); - } - last_fm = fm; - fm = fm->next; - } - - /* Print free space */ - if ((len < (off + count)) && last_fm - && (last_fm->offset < c->fmc->flash_size)) { - len += sprintf (page + len, - "%08lX %08lX free\n", - (unsigned long) last_fm->offset + - last_fm->size, - (unsigned long) (c->fmc->flash_size - - (last_fm->offset + last_fm->size))); - } - - /* We're done */ - *eof = 1; - - /* Return length */ - return len; -} diff --git a/fs/jffs/jffs_proc.h b/fs/jffs/jffs_proc.h deleted file mode 100644 index 39a1c5d162b..00000000000 --- a/fs/jffs/jffs_proc.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * JFFS -- Journaling Flash File System, Linux implementation. - * - * Copyright (C) 2000 Axis Communications AB. - * - * Created by Simon Kagstrom <simonk@axis.com>. - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * $Id: jffs_proc.h,v 1.2 2000/11/15 22:04:12 sjhill Exp $ - */ - -/* jffs_proc.h defines a structure for inclusion in the proc-file system. */ -#ifndef __LINUX_JFFS_PROC_H__ -#define __LINUX_JFFS_PROC_H__ - -#include <linux/proc_fs.h> - -/* The proc_dir_entry for jffs (defined in jffs_proc.c). */ -extern struct proc_dir_entry *jffs_proc_root; - -int jffs_register_jffs_proc_dir(int mtd, struct jffs_control *c); -int jffs_unregister_jffs_proc_dir(struct jffs_control *c); - -#endif /* __LINUX_JFFS_PROC_H__ */ diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 02826967ab5..07119c42a86 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -348,23 +348,27 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) ret = jffs2_sum_init(c); if (ret) - return ret; + goto out_free; if (jffs2_build_filesystem(c)) { dbg_fsbuild("build_fs failed\n"); jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); - - return -EIO; + ret = -EIO; + goto out_free; } jffs2_calc_trigger_levels(c); return 0; + + out_free: +#ifndef __ECOS + if (jffs2_blocks_use_vmalloc(c)) + vfree(c->blocks); + else +#endif + kfree(c->blocks); + + return ret; } diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 3681d0728ac..0c1fc6e20b4 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -16,7 +16,6 @@ #endif #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index da6034d5071..9fa2e27f064 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -13,7 +13,6 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/sched.h> #include <linux/fs.h> #include <linux/crc32.h> #include <linux/jffs2.h> @@ -46,7 +45,7 @@ const struct file_operations jffs2_dir_operations = }; -struct inode_operations jffs2_dir_inode_operations = +const struct inode_operations jffs2_dir_inode_operations = { .create = jffs2_create, .lookup = jffs2_lookup, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 242875f77cb..e82eeaf7590 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -54,7 +54,7 @@ const struct file_operations jffs2_file_operations = /* jffs2_file_inode_operations */ -struct inode_operations jffs2_file_inode_operations = +const struct inode_operations jffs2_file_inode_operations = { .permission = jffs2_permission, .setattr = jffs2_setattr, diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index b98594992ee..ea88f69af13 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -98,20 +98,14 @@ struct jffs2_sb_info { uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */ #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - /* Write-behind buffer for NAND flash */ - unsigned char *wbuf; - unsigned char *oobbuf; + unsigned char *wbuf; /* Write-behind buffer for NAND flash */ uint32_t wbuf_ofs; uint32_t wbuf_len; struct jffs2_inodirty *wbuf_inodes; - struct rw_semaphore wbuf_sem; /* Protects the write buffer */ - /* Information about out-of-band area usage... */ - struct nand_ecclayout *ecclayout; - uint32_t badblock_pos; - uint32_t fsdata_pos; - uint32_t fsdata_len; + unsigned char *oobbuf; + int oobavail; /* How many bytes are available for JFFS2 in OOB */ #endif struct jffs2_summary *summary; /* Summary information */ diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 9f41fc01a37..e07a0edcdb4 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -153,11 +153,11 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c); /* dir.c */ extern const struct file_operations jffs2_dir_operations; -extern struct inode_operations jffs2_dir_inode_operations; +extern const struct inode_operations jffs2_dir_inode_operations; /* file.c */ extern const struct file_operations jffs2_file_operations; -extern struct inode_operations jffs2_file_inode_operations; +extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; int jffs2_fsync(struct file *, struct dentry *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); @@ -166,7 +166,7 @@ int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long); /* symlink.c */ -extern struct inode_operations jffs2_symlink_inode_operations; +extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ int jffs2_setattr (struct dentry *, struct iattr *); diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 3af746eaff0..31c1475d922 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -450,16 +450,20 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo #ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { - int ret = jffs2_check_nand_cleanmarker(c, jeb); + int ret; + + if (c->mtd->block_isbad(c->mtd, jeb->offset)) + return BLK_STATE_BADBLOCK; + + ret = jffs2_check_nand_cleanmarker(c, jeb); D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret)); + /* Even if it's not found, we still scan to see if the block is empty. We use this information to decide whether to erase it or not. */ switch (ret) { case 0: cleanmarkerfound = 1; break; case 1: break; - case 2: return BLK_STATE_BADBLOCK; - case 3: return BLK_STATE_ALLDIRTY; /* Block has failed to erase min. once */ default: return ret; } } diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 25265965bdc..30f888414ce 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -14,7 +14,6 @@ */ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/mtd/mtd.h> #include <linux/pagemap.h> diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 08a0e6c49e6..cc7e8e71ad4 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -66,7 +66,7 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) return 0; } -static struct super_operations jffs2_super_operations = +static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, .destroy_inode =jffs2_destroy_inode, diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b90d5aa3d96..7e4882c8a7e 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -20,7 +20,7 @@ static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); -struct inode_operations jffs2_symlink_inode_operations = +const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 9c99859f5ed..de718e3a169 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -957,43 +957,48 @@ exit: return ret; } -#define NR_OOB_SCAN_PAGES 4 +#define NR_OOB_SCAN_PAGES 4 + +/* For historical reasons we use only 12 bytes for OOB clean marker */ +#define OOB_CM_SIZE 12 + +static const struct jffs2_unknown_node oob_cleanmarker = +{ + .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), + .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), + .totlen = cpu_to_je32(8) +}; /* - * Check, if the out of band area is empty + * Check, if the out of band area is empty. This function knows about the clean + * marker and if it is present in OOB, treats the OOB as empty anyway. */ int jffs2_check_oob_empty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int mode) { - int i, page, ret; - int oobsize = c->mtd->oobsize; + int i, ret; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.ooblen = NR_OOB_SCAN_PAGES * oobsize; + ops.mode = MTD_OOB_AUTO; + ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; - ops.ooboffs = 0; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); - if (ret) { - D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB " - "failed %d for block at %08x\n", ret, jeb->offset)); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; return ret; } - if (ops.oobretlen < ops.ooblen) { - D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB " - "returned short read (%zd bytes not %d) for block " - "at %08x\n", ops.oobretlen, ops.ooblen, jeb->offset)); - return -EIO; - } - - /* Special check for first page */ - for(i = 0; i < oobsize ; i++) { - /* Yeah, we know about the cleanmarker. */ - if (mode && i >= c->fsdata_pos && - i < c->fsdata_pos + c->fsdata_len) + for(i = 0; i < ops.ooblen; i++) { + if (mode && i < cmlen) + /* Yeah, we know about the cleanmarker */ continue; if (ops.oobbuf[i] != 0xFF) { @@ -1003,111 +1008,63 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, } } - /* we know, we are aligned :) */ - for (page = oobsize; page < ops.ooblen; page += sizeof(long)) { - long dat = *(long *)(&ops.oobbuf[page]); - if(dat != -1) - return 1; - } return 0; } /* - * Scan for a valid cleanmarker and for bad blocks + * Check for a valid cleanmarker. + * Returns: 0 if a valid cleanmarker was found + * 1 if no cleanmarker was found + * negative error code if an error occurred */ -int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, - struct jffs2_eraseblock *jeb) +int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, + struct jffs2_eraseblock *jeb) { - struct jffs2_unknown_node n; struct mtd_oob_ops ops; - int oobsize = c->mtd->oobsize; - unsigned char *p,*b; - int i, ret; - size_t offset = jeb->offset; - - /* Check first if the block is bad. */ - if (c->mtd->block_isbad(c->mtd, offset)) { - D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()" - ": Bad block at %08x\n", jeb->offset)); - return 2; - } + int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.ooblen = oobsize; + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; - ops.ooboffs = 0; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; - ret = c->mtd->read_oob(c->mtd, offset, &ops); - if (ret) { - D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): " - "Read OOB failed %d for block at %08x\n", - ret, jeb->offset)); + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; return ret; } - if (ops.oobretlen < ops.ooblen) { - D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): " - "Read OOB return short read (%zd bytes not %d) " - "for block at %08x\n", ops.oobretlen, ops.ooblen, - jeb->offset)); - return -EIO; - } - - n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK); - n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER); - n.totlen = cpu_to_je32 (8); - p = (unsigned char *) &n; - b = c->oobbuf + c->fsdata_pos; - - for (i = c->fsdata_len; i; i--) { - if (*b++ != *p++) - ret = 1; - } - - D1(if (ret == 1) { - printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): " - "Cleanmarker node not detected in block at %08x\n", - offset); - printk(KERN_WARNING "OOB at %08zx was ", offset); - for (i=0; i < oobsize; i++) - printk("%02x ", c->oobbuf[i]); - printk("\n"); - }); - return ret; + return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen); } int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { - struct jffs2_unknown_node n; - int ret; + int ret; struct mtd_oob_ops ops; + int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); - n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER); - n.totlen = cpu_to_je32(8); - - ops.ooblen = c->fsdata_len; - ops.oobbuf = (uint8_t *)&n; - ops.ooboffs = c->fsdata_pos; + ops.mode = MTD_OOB_AUTO; + ops.ooblen = cmlen; + ops.oobbuf = (uint8_t *)&oob_cleanmarker; + ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ops.mode = MTD_OOB_PLACE; ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); - - if (ret) { - D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): " - "Write failed for block at %08x: error %d\n", - jeb->offset, ret)); + if (ret || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); + if (!ret) + ret = -EIO; return ret; } - if (ops.oobretlen != ops.ooblen) { - D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): " - "Short write for block at %08x: %zd not %d\n", - jeb->offset, ops.oobretlen, ops.ooblen)); - return -EIO; - } + return 0; } @@ -1140,41 +1097,24 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * return 1; } -static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c) +int jffs2_nand_flash_setup(struct jffs2_sb_info *c) { struct nand_ecclayout *oinfo = c->mtd->ecclayout; - /* Do this only, if we have an oob buffer */ if (!c->mtd->oobsize) return 0; /* Cleanmarker is out-of-band, so inline size zero */ c->cleanmarker_size = 0; - /* Should we use autoplacement ? */ - if (!oinfo) { - D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n")); + if (!oinfo || oinfo->oobavail == 0) { + printk(KERN_ERR "inconsistent device description\n"); return -EINVAL; } - D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n")); - /* Get the position of the free bytes */ - if (!oinfo->oobfree[0].length) { - printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep." - " Autoplacement selected and no empty space in oob\n"); - return -ENOSPC; - } - c->fsdata_pos = oinfo->oobfree[0].offset; - c->fsdata_len = oinfo->oobfree[0].length; - if (c->fsdata_len > 8) - c->fsdata_len = 8; + D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n")); - return 0; -} - -int jffs2_nand_flash_setup(struct jffs2_sb_info *c) -{ - int res; + c->oobavail = oinfo->oobavail; /* Initialise write buffer */ init_rwsem(&c->wbuf_sem); @@ -1185,22 +1125,13 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c) if (!c->wbuf) return -ENOMEM; - c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL); - if (!c->oobbuf) - return -ENOMEM; - - res = jffs2_nand_set_oobinfo(c); - -#ifdef BREAKME - if (!brokenbuf) - brokenbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); - if (!brokenbuf) { + c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL); + if (!c->oobbuf) { kfree(c->wbuf); return -ENOMEM; } - memset(brokenbuf, 0xdb, c->wbuf_pagesize); -#endif - return res; + + return 0; } void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index aa9132d0492..f7f8eff19b7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -88,7 +88,7 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -struct inode_operations jfs_file_inode_operations = { +const struct inode_operations jfs_file_inode_operations = { .truncate = jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index f5719117edf..e285022f006 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock, * Take appropriate lock on inode */ if (create) - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); else - IREAD_LOCK(ip); + IREAD_LOCK(ip, RDWRLOCK_NORMAL); if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && @@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip) nobh_truncate_page(ip->i_mapping, ip->i_size); - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); jfs_truncate_nolock(ip, ip->i_size); IWRITE_UNLOCK(ip); } diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index ddffbbd4d95..7378798f0b2 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -39,10 +39,6 @@ extern void jfs_proc_clean(void); /* * assert with traditional printf/panic */ -#ifdef CONFIG_KERNEL_ASSERTS -/* kgdb stuff */ -#define assert(p) KERNEL_ASSERT(#p, p) -#else #define assert(p) do { \ if (!(p)) { \ printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ @@ -50,7 +46,6 @@ extern void jfs_proc_clean(void); BUG(); \ } \ } while (0) -#endif /* * debug ON diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 23546c8fd48..82b0544bd76 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* block to be freed better be within the mapsize. */ if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { @@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * allocation group size, try to allocate anywhere. */ if (l2nb > bmp->db_agl2size) { - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); rc = dbAllocAny(bmp, nblocks, l2nb, results); @@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * the hint using a tiered strategy. */ if (nblocks <= BPERDMAP) { - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* get the buffer for the dmap containing the hint. */ @@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) /* try to satisfy the allocation request with blocks within * the same allocation group as the hint. */ - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) goto write_unlock; @@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * Let dbNextAG recommend a preferred allocation group */ agno = dbNextAG(ipbmap); - IWRITE_LOCK(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); /* Try to allocate within this allocation group. if that fails, try to * allocate anywhere in the map. @@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) s64 lblkno; struct metapage *mp; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* * validate extent request: @@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) */ extblkno = lastblkno + 1; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* better be within the file system */ bmp = sbi->bmap; @@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; - IREAD_LOCK(ipbmap); + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); /* block to be allocated better be within the mapsize. */ ASSERT(nblocks <= bmp->db_mapsize - blkno); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 53f63b47a6d..aa5124b643b 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -331,7 +331,7 @@ int diRead(struct inode *ip) /* read the iag */ imap = JFS_IP(ipimap)->i_imap; - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { @@ -920,7 +920,7 @@ int diFree(struct inode *ip) /* Obtain read lock in imap inode. Don't release it until we have * read all of the IAG's that we are going to. */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag. */ @@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) AG_LOCK(imap, agno); /* Get read lock on imap inode */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* get the iag number and read the iag */ iagno = INOTOIAG(inum); @@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) return -ENOSPC; /* obtain read lock on imap inode */ - IREAD_LOCK(imap->im_ipimap); + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); /* read the iag at the head of the list. */ @@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) } else { /* read the iag. */ - IREAD_LOCK(imap->im_ipimap); + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(imap->im_ipimap); jfs_error(ip->i_sb, "diAllocExt: error reading iag"); @@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) */ /* acquire inode map lock */ - IWRITE_LOCK(ipimap); + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { IWRITE_UNLOCK(ipimap); @@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) } /* obtain read lock on map */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag */ if ((rc = diIAGRead(imap, iagno, &mp))) { @@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap, return -EIO; } /* read the iag */ - IREAD_LOCK(ipimap); + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 94005584445..8f453eff3c8 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -109,9 +109,11 @@ struct jfs_inode_info { #define JFS_ACL_NOT_CACHED ((void *)-1) -#define IREAD_LOCK(ip) down_read(&JFS_IP(ip)->rdwrlock) +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) -#define IWRITE_LOCK(ip) down_write(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) #define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) /* @@ -127,6 +129,29 @@ enum cflags { COMMIT_Synclist, /* metadata pages on group commit synclist */ }; +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + #define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) #define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) #define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 0d06ccfaff0..6802837f757 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -35,10 +35,10 @@ extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations jfs_aops; -extern struct inode_operations jfs_dir_inode_operations; +extern const struct inode_operations jfs_dir_inode_operations; extern const struct file_operations jfs_dir_operations; -extern struct inode_operations jfs_file_inode_operations; +extern const struct inode_operations jfs_file_inode_operations; extern const struct file_operations jfs_file_operations; -extern struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_symlink_inode_operations; extern struct dentry_operations jfs_ci_dentry_operations; #endif /* _H_JFS_INODE */ diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h index 7d78e83d7c4..df48ece4b7a 100644 --- a/fs/jfs/jfs_lock.h +++ b/fs/jfs/jfs_lock.h @@ -42,7 +42,7 @@ do { \ if (cond) \ break; \ unlock_cmd; \ - schedule(); \ + io_schedule(); \ lock_cmd; \ } \ current->state = TASK_RUNNING; \ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index ceaf03b9493..58deae00750 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp) set_current_state(TASK_UNINTERRUPTIBLE); if (metapage_locked(mp)) { unlock_page(mp->page); - schedule(); + io_schedule(); lock_page(mp->page); } } while (trylock_metapage(mp)); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d558e51b0df..6988a1082f5 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) add_wait_queue(event, &wait); set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); - schedule(); + io_schedule(); current->state = TASK_RUNNING; remove_wait_queue(event, &wait); } diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index e98eb03e531..acc97c46d8a 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, nsplit = 0; /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtSearch!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(btstack, bn, index); /* get the child page block number */ @@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) */ getChild: /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(&btstack, bn, index); /* get child page */ @@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) */ getChild: /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); + XT_PUTPAGE(mp); + return -EIO; + } BT_PUSH(&btstack, bn, index); /* get child page */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a6a8c16c872..41c20477126 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((rc = get_UCSname(&dname, dentry))) goto out; - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry, tid = txBegin(ip->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); /* * scan parent directory for entry/freespace @@ -913,8 +913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_security(tid, ip, dip); if (rc) @@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out3; } } else if (new_ip) { - IWRITE_LOCK(new_ip); + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ DQUOT_INIT(new_ip); } @@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ tid = txBegin(new_dir->i_sb, 0); - mutex_lock(&JFS_IP(new_dir)->commit_mutex); - mutex_lock(&JFS_IP(old_ip)->commit_mutex); + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. + */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); if (old_dir != new_dir) - mutex_lock(&JFS_IP(old_dir)->commit_mutex); + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); if (new_ip) { - mutex_lock(&JFS_IP(new_ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); /* * Change existing directory entry to new inode number */ @@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, tid = txBegin(dir->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dir); if (rc) @@ -1495,7 +1503,7 @@ struct dentry *jfs_get_parent(struct dentry *dentry) return parent; } -struct inode_operations jfs_dir_inode_operations = { +const struct inode_operations jfs_dir_inode_operations = { .create = jfs_create, .lookup = jfs_lookup, .link = jfs_link, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 846ac8f3451..52d73d54a93 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -46,7 +46,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache * jfs_inode_cachep; -static struct super_operations jfs_super_operations; +static const struct super_operations jfs_super_operations; static struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; @@ -716,7 +716,7 @@ out: #endif -static struct super_operations jfs_super_operations = { +static const struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .destroy_inode = jfs_destroy_inode, .read_inode = jfs_read_inode, diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index cee43f36f51..4af1a05aad0 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -29,7 +29,7 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations jfs_symlink_inode_operations = { +const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jfs_follow_link, .setxattr = jfs_setxattr, diff --git a/fs/libfs.c b/fs/libfs.c index 503898d5c4a..7d487047dbb 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -186,7 +186,7 @@ const struct file_operations simple_dir_operations = { .fsync = simple_sync_file, }; -struct inode_operations simple_dir_inode_operations = { +const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; @@ -195,11 +195,11 @@ struct inode_operations simple_dir_inode_operations = { * will never be mountable) */ int get_sb_pseudo(struct file_system_type *fs_type, char *name, - struct super_operations *ops, unsigned long magic, + const struct super_operations *ops, unsigned long magic, struct vfsmount *mnt) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); - static struct super_operations default_ops = {.statfs = simple_statfs}; + static const struct super_operations default_ops = {.statfs = simple_statfs}; struct dentry *dentry; struct inode *root; struct qstr d_name = {.name = name, .len = strlen(name)}; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 0b4acc1c5e7..a5c019e1a44 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -361,7 +361,6 @@ static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message * { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; - int status = -ENOLCK; dprintk("lockd: call procedure %d on %s (async)\n", (int)proc, host->h_name); @@ -373,12 +372,10 @@ static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message * msg->rpc_proc = &clnt->cl_procinfo[proc]; /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req); - if (status == 0) - return 0; + return rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req); out_err: - nlm_release_call(req); - return status; + tk_ops->rpc_release(req); + return -ENOLCK; } int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 3d4610c2a26..ad21c0713ef 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -9,7 +9,6 @@ */ #include <linux/types.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/sunrpc/clnt.h> @@ -192,7 +191,7 @@ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp, const char *hostname, int hostname_len) { - return nlm_lookup_host(1, &rqstp->rq_addr, + return nlm_lookup_host(1, svc_addr_in(rqstp), rqstp->rq_prot, rqstp->rq_vers, hostname, hostname_len); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 8ca18085e68..126b1bf02c0 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -141,6 +141,7 @@ lockd(struct svc_rqst *rqstp) */ while ((nlmsvc_users || !signalled()) && nlmsvc_pid == current->pid) { long timeout = MAX_SCHEDULE_TIMEOUT; + char buf[RPC_MAX_ADDRBUFLEN]; if (signalled()) { flush_signals(current); @@ -175,11 +176,10 @@ lockd(struct svc_rqst *rqstp) break; } - dprintk("lockd: request from %08x\n", - (unsigned)ntohl(rqstp->rq_addr.sin_addr.s_addr)); + dprintk("lockd: request from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); svc_process(rqstp); - } flush_signals(current); @@ -223,23 +223,29 @@ static int find_socket(struct svc_serv *serv, int proto) return found; } +/* + * Make any sockets that are needed but not present. + * If nlm_udpport or nlm_tcpport were set as module + * options, make those sockets unconditionally + */ static int make_socks(struct svc_serv *serv, int proto) { - /* Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally - */ - static int warned; + static int warned; int err = 0; + if (proto == IPPROTO_UDP || nlm_udpport) if (!find_socket(serv, IPPROTO_UDP)) - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport); - if (err == 0 && (proto == IPPROTO_TCP || nlm_tcpport)) + err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, + SVC_SOCK_DEFAULTS); + if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) if (!find_socket(serv, IPPROTO_TCP)) - err= svc_makesock(serv, IPPROTO_TCP, nlm_tcpport); - if (!err) + err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, + SVC_SOCK_DEFAULTS); + + if (err >= 0) { warned = 0; - else if (warned++ == 0) + err = 0; + } else if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); return err; @@ -434,7 +440,7 @@ static ctl_table nlm_sysctl_root[] = { }; /* - * Module (and driverfs) parameters. + * Module (and sysfs) parameters. */ #define param_set_min_max(name, type, which_strtol, min, max) \ @@ -506,7 +512,7 @@ module_param(nsm_use_hostnames, bool, 0644); static int __init init_nlm(void) { - nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root, 0); + nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); return nlm_sysctl_table ? 0 : -ENOMEM; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index f67146a8199..47a66aa5d55 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -224,7 +224,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); + resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -421,15 +421,16 @@ static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr = rqstp->rq_addr; + struct sockaddr_in saddr; + + memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); dprintk("lockd: SM_NOTIFY called\n"); if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) || ntohs(saddr.sin_port) >= 1024) { - printk(KERN_WARNING - "lockd: rejected NSM callback from %08x:%d\n", - ntohl(rqstp->rq_addr.sin_addr.s_addr), - ntohs(rqstp->rq_addr.sin_port)); + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return rpc_system_err; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c7db0a5bccd..cf51f849e76 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -593,9 +593,7 @@ callback: /* Call the client */ kref_get(&block->b_count); - if (nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, - &nlmsvc_grant_ops) < 0) - nlmsvc_release_block(block); + nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, &nlmsvc_grant_ops); } /* diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3707c3a23e9..31cb4842573 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -253,7 +253,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); + resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -452,15 +452,16 @@ static __be32 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr = rqstp->rq_addr; + struct sockaddr_in saddr; + + memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); dprintk("lockd: SM_NOTIFY called\n"); if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) || ntohs(saddr.sin_port) >= 1024) { - printk(KERN_WARNING - "lockd: rejected NSM callback from %08x:%d\n", - ntohl(rqstp->rq_addr.sin_addr.s_addr), - ntohs(rqstp->rq_addr.sin_port)); + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return rpc_system_err; } diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index df6b1075b54..c4a554df7b7 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -26,14 +26,14 @@ static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, _ for (i=0; i<numblocks-1; i++) { if (!(bh=map[i])) return(0); - for (j=0; j<BLOCK_SIZE; j++) + for (j=0; j<bh->b_size; j++) sum += nibblemap[bh->b_data[j] & 0xf] + nibblemap[(bh->b_data[j]>>4) & 0xf]; } if (numblocks==0 || !(bh=map[numblocks-1])) return(0); - i = ((numbits-(numblocks-1)*BLOCK_SIZE*8)/16)*2; + i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; for (j=0; j<i; j++) { sum += nibblemap[bh->b_data[j] & 0xf] + nibblemap[(bh->b_data[j]>>4) & 0xf]; @@ -48,28 +48,29 @@ static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, _ return(sum); } -void minix_free_block(struct inode * inode, int block) +void minix_free_block(struct inode *inode, unsigned long block) { - struct super_block * sb = inode->i_sb; - struct minix_sb_info * sbi = minix_sb(sb); - struct buffer_head * bh; - unsigned int bit,zone; + struct super_block *sb = inode->i_sb; + struct minix_sb_info *sbi = minix_sb(sb); + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long bit, zone; if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { printk("Trying to free block not in datazone\n"); return; } zone = block - sbi->s_firstdatazone + 1; - bit = zone & 8191; - zone >>= 13; + bit = zone & ((1<<k) - 1); + zone >>= k; if (zone >= sbi->s_zmap_blocks) { printk("minix_free_block: nonexistent bitmap buffer\n"); return; } bh = sbi->s_zmap[zone]; lock_kernel(); - if (!minix_test_and_clear_bit(bit,bh->b_data)) - printk("free_block (%s:%d): bit already cleared\n", + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_block (%s:%lu): bit already cleared\n", sb->s_id, block); unlock_kernel(); mark_buffer_dirty(bh); @@ -79,6 +80,7 @@ void minix_free_block(struct inode * inode, int block) int minix_new_block(struct inode * inode) { struct minix_sb_info *sbi = minix_sb(inode->i_sb); + int bits_per_zone = 8 * inode->i_sb->s_blocksize; int i; for (i = 0; i < sbi->s_zmap_blocks; i++) { @@ -86,11 +88,12 @@ int minix_new_block(struct inode * inode) int j; lock_kernel(); - if ((j = minix_find_first_zero_bit(bh->b_data, 8192)) < 8192) { - minix_set_bit(j,bh->b_data); + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) { + minix_set_bit(j, bh->b_data); unlock_kernel(); mark_buffer_dirty(bh); - j += i*8192 + sbi->s_firstdatazone-1; + j += i * bits_per_zone + sbi->s_firstdatazone-1; if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) break; return j; @@ -137,6 +140,7 @@ minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) int block; struct minix_sb_info *sbi = minix_sb(sb); struct minix2_inode *p; + int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode); *bh = NULL; if (!ino || ino > sbi->s_ninodes) { @@ -146,14 +150,14 @@ minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) } ino--; block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + - ino / MINIX2_INODES_PER_BLOCK; + ino / minix2_inodes_per_block; *bh = sb_bread(sb, block); if (!*bh) { printk("Unable to read inode block\n"); return NULL; } p = (void *)(*bh)->b_data; - return p + ino % MINIX2_INODES_PER_BLOCK; + return p + ino % minix2_inodes_per_block; } /* Clear the link count and mode of a deleted inode on disk. */ @@ -185,26 +189,30 @@ static void minix_clear_inode(struct inode *inode) void minix_free_inode(struct inode * inode) { + struct super_block *sb = inode->i_sb; struct minix_sb_info *sbi = minix_sb(inode->i_sb); - struct buffer_head * bh; - unsigned long ino; + struct buffer_head *bh; + int k = sb->s_blocksize_bits + 3; + unsigned long ino, bit; ino = inode->i_ino; if (ino < 1 || ino > sbi->s_ninodes) { printk("minix_free_inode: inode 0 or nonexistent inode\n"); goto out; } - if ((ino >> 13) >= sbi->s_imap_blocks) { + bit = ino & ((1<<k) - 1); + ino >>= k; + if (ino >= sbi->s_imap_blocks) { printk("minix_free_inode: nonexistent imap in superblock\n"); goto out; } minix_clear_inode(inode); /* clear on-disk copy */ - bh = sbi->s_imap[ino >> 13]; + bh = sbi->s_imap[ino]; lock_kernel(); - if (!minix_test_and_clear_bit(ino & 8191, bh->b_data)) - printk("minix_free_inode: bit %lu already cleared\n", ino); + if (!minix_test_and_clear_bit(bit, bh->b_data)) + printk("minix_free_inode: bit %lu already cleared\n", bit); unlock_kernel(); mark_buffer_dirty(bh); out: @@ -217,35 +225,38 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) struct minix_sb_info *sbi = minix_sb(sb); struct inode *inode = new_inode(sb); struct buffer_head * bh; - int i,j; + int bits_per_zone = 8 * sb->s_blocksize; + unsigned long j; + int i; if (!inode) { *error = -ENOMEM; return NULL; } - j = 8192; + j = bits_per_zone; bh = NULL; *error = -ENOSPC; lock_kernel(); for (i = 0; i < sbi->s_imap_blocks; i++) { bh = sbi->s_imap[i]; - if ((j = minix_find_first_zero_bit(bh->b_data, 8192)) < 8192) + j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); + if (j < bits_per_zone) break; } - if (!bh || j >= 8192) { + if (!bh || j >= bits_per_zone) { unlock_kernel(); iput(inode); return NULL; } - if (minix_test_and_set_bit(j,bh->b_data)) { /* shouldn't happen */ - printk("new_inode: bit already set\n"); + if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ unlock_kernel(); + printk("minix_new_inode: bit already set\n"); iput(inode); return NULL; } unlock_kernel(); mark_buffer_dirty(bh); - j += i*8192; + j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { iput(inode); return NULL; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index ab782c4086f..cb4cb571fdd 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -4,6 +4,8 @@ * Copyright (C) 1991, 1992 Linus Torvalds * * minix directory handling functions + * + * Updated to filesystem version 3 by Daniel Aragones */ #include "minix.h" @@ -11,6 +13,7 @@ #include <linux/smp_lock.h> typedef struct minix_dir_entry minix_dirent; +typedef struct minix3_dir_entry minix3_dirent; static int minix_readdir(struct file *, void *, filldir_t); @@ -89,6 +92,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) unsigned long npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(sb); unsigned chunk_size = sbi->s_dirsize; + char *name; + __u32 inumber; lock_kernel(); @@ -105,16 +110,24 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) kaddr = (char *)page_address(page); p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; - for ( ; p <= limit ; p = minix_next_entry(p, sbi)) { - minix_dirent *de = (minix_dirent *)p; - if (de->inode) { + for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } + if (inumber) { int over; - unsigned l = strnlen(de->name,sbi->s_namelen); + unsigned l = strnlen(name, sbi->s_namelen); offset = p - kaddr; - over = filldir(dirent, de->name, l, - (n<<PAGE_CACHE_SHIFT) | offset, - de->inode, DT_UNKNOWN); + over = filldir(dirent, name, l, + (n << PAGE_CACHE_SHIFT) | offset, + inumber, DT_UNKNOWN); if (over) { dir_put_page(page); goto done; @@ -156,23 +169,34 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) unsigned long n; unsigned long npages = dir_pages(dir); struct page *page = NULL; - struct minix_dir_entry *de; + char *p; + char *namx; + __u32 inumber; *res_page = NULL; for (n = 0; n < npages; n++) { - char *kaddr; + char *kaddr, *limit; + page = dir_get_page(dir, n); if (IS_ERR(page)) continue; kaddr = (char*)page_address(page); - de = (struct minix_dir_entry *) kaddr; - kaddr += minix_last_byte(dir, n) - sbi->s_dirsize; - for ( ; (char *) de <= kaddr ; de = minix_next_entry(de,sbi)) { - if (!de->inode) + limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + namx = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + namx = de->name; + inumber = de->inode; + } + if (!inumber) continue; - if (namecompare(namelen,sbi->s_namelen,name,de->name)) + if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } dir_put_page(page); @@ -181,7 +205,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) found: *res_page = page; - return de; + return (minix_dirent *)p; } int minix_add_link(struct dentry *dentry, struct inode *inode) @@ -192,12 +216,15 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); struct page *page = NULL; - struct minix_dir_entry * de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; + char *kaddr, *p; + minix_dirent *de; + minix3_dirent *de3; unsigned from, to; int err; + char *namx = NULL; + __u32 inumber; /* * We take care of directory expansion in the same loop @@ -205,7 +232,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) * to protect that region. */ for (n = 0; n <= npages; n++) { - char *dir_end; + char *limit, *dir_end; page = dir_get_page(dir, n); err = PTR_ERR(page); @@ -214,20 +241,30 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) lock_page(page); kaddr = (char*)page_address(page); dir_end = kaddr + minix_last_byte(dir, n); - de = (minix_dirent *)kaddr; - kaddr += PAGE_CACHE_SIZE - sbi->s_dirsize; - while ((char *)de <= kaddr) { - if ((char *)de == dir_end) { + limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + de = (minix_dirent *)p; + de3 = (minix3_dirent *)p; + if (sbi->s_version == MINIX_V3) { + namx = de3->name; + inumber = de3->inode; + } else { + namx = de->name; + inumber = de->inode; + } + if (p == dir_end) { /* We hit i_size */ - de->inode = 0; + if (sbi->s_version == MINIX_V3) + de3->inode = 0; + else + de->inode = 0; goto got_it; } - if (!de->inode) + if (!inumber) goto got_it; err = -EEXIST; - if (namecompare(namelen,sbi->s_namelen,name,de->name)) + if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; - de = minix_next_entry(de, sbi); } unlock_page(page); dir_put_page(page); @@ -236,14 +273,19 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) return -EINVAL; got_it: - from = (char*)de - (char*)page_address(page); + from = p - (char*)page_address(page); to = from + sbi->s_dirsize; err = page->mapping->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; - memcpy (de->name, name, namelen); - memset (de->name + namelen, 0, sbi->s_dirsize - namelen - 2); - de->inode = inode->i_ino; + memcpy (namx, name, namelen); + if (sbi->s_version == MINIX_V3) { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); + de3->inode = inode->i_ino; + } else { + memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); + de->inode = inode->i_ino; + } err = dir_commit_chunk(page, from, to); dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); @@ -283,8 +325,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); - struct minix_sb_info * sbi = minix_sb(inode->i_sb); - struct minix_dir_entry * de; + struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; @@ -299,12 +340,23 @@ int minix_make_empty(struct inode *inode, struct inode *dir) kaddr = kmap_atomic(page, KM_USER0); memset(kaddr, 0, PAGE_CACHE_SIZE); - de = (struct minix_dir_entry *)kaddr; - de->inode = inode->i_ino; - strcpy(de->name,"."); - de = minix_next_entry(de, sbi); - de->inode = dir->i_ino; - strcpy(de->name,".."); + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)kaddr; + + de3->inode = inode->i_ino; + strcpy(de3->name, "."); + de3 = minix_next_entry(de3, sbi); + de3->inode = dir->i_ino; + strcpy(de3->name, ".."); + } else { + minix_dirent *de = (minix_dirent *)kaddr; + + de->inode = inode->i_ino; + strcpy(de->name, "."); + de = minix_next_entry(de, sbi); + de->inode = dir->i_ino; + strcpy(de->name, ".."); + } kunmap_atomic(kaddr, KM_USER0); err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); @@ -321,33 +373,41 @@ int minix_empty_dir(struct inode * inode) struct page *page = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); + char *name; + __u32 inumber; for (i = 0; i < npages; i++) { - char *kaddr; - minix_dirent * de; - page = dir_get_page(inode, i); + char *p, *kaddr, *limit; + page = dir_get_page(inode, i); if (IS_ERR(page)) continue; kaddr = (char *)page_address(page); - de = (minix_dirent *)kaddr; - kaddr += minix_last_byte(inode, i) - sbi->s_dirsize; + limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; + for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { + if (sbi->s_version == MINIX_V3) { + minix3_dirent *de3 = (minix3_dirent *)p; + name = de3->name; + inumber = de3->inode; + } else { + minix_dirent *de = (minix_dirent *)p; + name = de->name; + inumber = de->inode; + } - while ((char *)de <= kaddr) { - if (de->inode != 0) { + if (inumber != 0) { /* check for . and .. */ - if (de->name[0] != '.') + if (name[0] != '.') goto not_empty; - if (!de->name[1]) { - if (de->inode != inode->i_ino) + if (!name[1]) { + if (inumber != inode->i_ino) goto not_empty; - } else if (de->name[1] != '.') + } else if (name[1] != '.') goto not_empty; - else if (de->name[2]) + else if (name[2]) goto not_empty; } - de = minix_next_entry(de, sbi); } dir_put_page(page); } diff --git a/fs/minix/file.c b/fs/minix/file.c index 40eac2e60d2..f92baa1d757 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -26,7 +26,7 @@ const struct file_operations minix_file_operations = { .sendfile = generic_file_sendfile, }; -struct inode_operations minix_file_inode_operations = { +const struct inode_operations minix_file_inode_operations = { .truncate = minix_truncate, .getattr = minix_getattr, }; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 629e09b38c5..92e383af370 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -7,6 +7,7 @@ * Minix V2 fs support. * * Modified for 680x0 by Andreas Schwab + * Updated to filesystem version 3 by Daniel Aragones */ #include <linux/module.h> @@ -36,7 +37,8 @@ static void minix_put_super(struct super_block *sb) struct minix_sb_info *sbi = minix_sb(sb); if (!(sb->s_flags & MS_RDONLY)) { - sbi->s_ms->s_state = sbi->s_mount_state; + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + sbi->s_ms->s_state = sbi->s_mount_state; mark_buffer_dirty(sbi->s_sbh); } for (i = 0; i < sbi->s_imap_blocks; i++) @@ -93,7 +95,7 @@ static void destroy_inodecache(void) kmem_cache_destroy(minix_inode_cachep); } -static struct super_operations minix_sops = { +static const struct super_operations minix_sops = { .alloc_inode = minix_alloc_inode, .destroy_inode = minix_destroy_inode, .read_inode = minix_read_inode, @@ -117,12 +119,17 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) !(sbi->s_mount_state & MINIX_VALID_FS)) return 0; /* Mounting a rw partition read-only. */ - ms->s_state = sbi->s_mount_state; + if (sbi->s_version != MINIX_V3) + ms->s_state = sbi->s_mount_state; mark_buffer_dirty(sbi->s_sbh); } else { /* Mount a partition which is read-only, read-write. */ - sbi->s_mount_state = ms->s_state; - ms->s_state &= ~MINIX_VALID_FS; + if (sbi->s_version != MINIX_V3) { + sbi->s_mount_state = ms->s_state; + ms->s_state &= ~MINIX_VALID_FS; + } else { + sbi->s_mount_state = MINIX_VALID_FS; + } mark_buffer_dirty(sbi->s_sbh); if (!(sbi->s_mount_state & MINIX_VALID_FS)) @@ -140,7 +147,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) struct buffer_head *bh; struct buffer_head **map; struct minix_super_block *ms; - int i, block; + struct minix3_super_block *m3s = NULL; + unsigned long i, block; struct inode *root_inode; struct minix_sb_info *sbi; @@ -192,6 +200,22 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) sbi->s_dirsize = 32; sbi->s_namelen = 30; sbi->s_link_max = MINIX2_LINK_MAX; + } else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) { + m3s = (struct minix3_super_block *) bh->b_data; + s->s_magic = m3s->s_magic; + sbi->s_imap_blocks = m3s->s_imap_blocks; + sbi->s_zmap_blocks = m3s->s_zmap_blocks; + sbi->s_firstdatazone = m3s->s_firstdatazone; + sbi->s_log_zone_size = m3s->s_log_zone_size; + sbi->s_max_size = m3s->s_max_size; + sbi->s_ninodes = m3s->s_ninodes; + sbi->s_nzones = m3s->s_zones; + sbi->s_dirsize = 64; + sbi->s_namelen = 60; + sbi->s_version = MINIX_V3; + sbi->s_link_max = MINIX2_LINK_MAX; + sbi->s_mount_state = MINIX_VALID_FS; + sb_set_blocksize(s, m3s->s_blocksize); } else goto out_no_fs; @@ -236,7 +260,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) s->s_root->d_op = &minix_dentry_operations; if (!(s->s_flags & MS_RDONLY)) { - ms->s_state &= ~MINIX_VALID_FS; + if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ + ms->s_state &= ~MINIX_VALID_FS; mark_buffer_dirty(bh); } if (!(sbi->s_mount_state & MINIX_VALID_FS)) @@ -278,8 +303,8 @@ out_illegal_sb: out_no_fs: if (!silent) - printk("VFS: Can't find a Minix or Minix V2 filesystem " - "on device %s\n", s->s_id); + printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 " + "on device %s.\n", s->s_id); out_release: brelse(bh); goto out; @@ -344,7 +369,7 @@ static const struct address_space_operations minix_aops = { .bmap = minix_bmap }; -static struct inode_operations minix_symlink_inode_operations = { +static const struct inode_operations minix_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -537,12 +562,14 @@ int minix_sync_inode(struct inode * inode) int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; generic_fillattr(dentry->d_inode, stat); if (INODE_VERSION(dentry->d_inode) == MINIX_V1) - stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size); + stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else - stat->blocks = (BLOCK_SIZE / 512) * V2_minix_blocks(stat->size); - stat->blksize = BLOCK_SIZE; + stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); + stat->blksize = sb->s_blocksize; return 0; } diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 429baf8de10..a731cabf154 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -23,7 +23,7 @@ static inline int verify_chain(Indirect *from, Indirect *to) static inline block_t *block_end(struct buffer_head *bh) { - return (block_t *)((char*)bh->b_data + BLOCK_SIZE); + return (block_t *)((char*)bh->b_data + bh->b_size); } static inline Indirect *get_branch(struct inode *inode, @@ -85,7 +85,7 @@ static int alloc_branch(struct inode *inode, branch[n].key = cpu_to_block(nr); bh = sb_getblk(inode->i_sb, parent); lock_buffer(bh); - memset(bh->b_data, 0, BLOCK_SIZE); + memset(bh->b_data, 0, bh->b_size); branch[n].bh = bh; branch[n].p = (block_t*) bh->b_data + offsets[n]; *branch[n].p = branch[n].key; @@ -292,6 +292,7 @@ static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth static inline void truncate (struct inode * inode) { + struct super_block *sb = inode->i_sb; block_t *idata = i_data(inode); int offsets[DEPTH]; Indirect chain[DEPTH]; @@ -301,7 +302,7 @@ static inline void truncate (struct inode * inode) int first_whole; long iblock; - iblock = (inode->i_size + BLOCK_SIZE-1) >> 10; + iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits; block_truncate_page(inode->i_mapping, inode->i_size, get_block); n = block_to_path(inode, iblock, offsets); @@ -346,15 +347,16 @@ do_indirects: mark_inode_dirty(inode); } -static inline unsigned nblocks(loff_t size) +static inline unsigned nblocks(loff_t size, struct super_block *sb) { + int k = sb->s_blocksize_bits - 10; unsigned blocks, res, direct = DIRECT, i = DEPTH; - blocks = (size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k); res = blocks; while (--i && blocks > direct) { blocks -= direct; - blocks += BLOCK_SIZE/sizeof(block_t) - 1; - blocks /= BLOCK_SIZE/sizeof(block_t); + blocks += sb->s_blocksize/sizeof(block_t) - 1; + blocks /= sb->s_blocksize/sizeof(block_t); res += blocks; direct = 1; } diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 656b1347a25..1a5f3bf0bce 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -55,7 +55,7 @@ void V1_minix_truncate(struct inode * inode) truncate(inode); } -unsigned V1_minix_blocks(loff_t size) +unsigned V1_minix_blocks(loff_t size, struct super_block *sb) { - return nblocks(size); + return nblocks(size, sb); } diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 9adcdc754e0..ad8f0dec4ef 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -23,10 +23,11 @@ static inline block_t *i_data(struct inode *inode) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; + struct super_block *sb = inode->i_sb; if (block < 0) { printk("minix_bmap: block<0\n"); - } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { + } else if (block >= (minix_sb(inode->i_sb)->s_max_size/sb->s_blocksize)) { printk("minix_bmap: block>big\n"); } else if (block < 7) { offsets[n++] = block; @@ -60,7 +61,7 @@ void V2_minix_truncate(struct inode * inode) truncate(inode); } -unsigned V2_minix_blocks(loff_t size) +unsigned V2_minix_blocks(loff_t size, struct super_block *sb) { - return nblocks(size); + return nblocks(size, sb); } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index c55b77cdcc8..73ef84f8fb0 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -7,11 +7,10 @@ * truncated. Else they will be disallowed (ENAMETOOLONG). */ #define NO_TRUNCATE 1 - #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version - #define MINIX_V1 0x0001 /* original minix fs */ #define MINIX_V2 0x0002 /* minix V2 fs */ +#define MINIX_V3 0x0003 /* minix V3 fs */ /* * minix fs inode data in memory @@ -52,12 +51,10 @@ extern struct inode * minix_new_inode(const struct inode * dir, int * error); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); extern int minix_new_block(struct inode * inode); -extern void minix_free_block(struct inode * inode, int block); +extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); - extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern void V2_minix_truncate(struct inode *); extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); extern void minix_truncate(struct inode *); @@ -65,8 +62,8 @@ extern int minix_sync_inode(struct inode *); extern void minix_set_inode(struct inode *, dev_t); extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); -extern unsigned V1_minix_blocks(loff_t); -extern unsigned V2_minix_blocks(loff_t); +extern unsigned V1_minix_blocks(loff_t, struct super_block *); +extern unsigned V2_minix_blocks(loff_t, struct super_block *); extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); extern int minix_add_link(struct dentry*, struct inode*); @@ -76,11 +73,10 @@ extern int minix_empty_dir(struct inode*); extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); extern ino_t minix_inode_by_name(struct dentry*); - extern int minix_sync_file(struct file *, struct dentry *, int); -extern struct inode_operations minix_file_inode_operations; -extern struct inode_operations minix_dir_inode_operations; +extern const struct inode_operations minix_file_inode_operations; +extern const struct inode_operations minix_dir_inode_operations; extern const struct file_operations minix_file_operations; extern const struct file_operations minix_dir_operations; extern struct dentry_operations minix_dentry_operations; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 299bb66e3bd..f4aa7a93904 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -291,7 +291,7 @@ out: /* * directories can handle most operations... */ -struct inode_operations minix_dir_inode_operations = { +const struct inode_operations minix_dir_inode_operations = { .create = minix_create, .lookup = minix_lookup, .link = minix_link, diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 452461955cb..30f7d0ae221 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -646,7 +646,7 @@ out: return err; } -static struct inode_operations msdos_dir_inode_operations = { +static const struct inode_operations msdos_dir_inode_operations = { .create = msdos_create, .lookup = msdos_lookup, .unlink = msdos_unlink, diff --git a/fs/namei.c b/fs/namei.c index e4f108f0823..ee60cc4d345 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2688,10 +2688,11 @@ int __page_symlink(struct inode *inode, const char *symname, int len, { struct address_space *mapping = inode->i_mapping; struct page *page; - int err = -ENOMEM; + int err; char *kaddr; retry: + err = -ENOMEM; page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; @@ -2744,7 +2745,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) mapping_gfp_mask(inode->i_mapping)); } -struct inode_operations page_symlink_inode_operations = { +const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/namespace.c b/fs/namespace.c index 5ef336c1103..fd999cab7b5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -53,9 +53,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) struct vfsmount *alloc_vfsmnt(const char *name) { - struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); + struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { - memset(mnt, 0, sizeof(struct vfsmount)); atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 73747772c3b..011ef0b6d2d 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -58,7 +58,7 @@ const struct file_operations ncp_dir_operations = #endif }; -struct inode_operations ncp_dir_inode_operations = +const struct inode_operations ncp_dir_inode_operations = { .create = ncp_create, .lookup = ncp_lookup, diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index b91fea03b1c..6b1f6d27099 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -297,7 +297,7 @@ const struct file_operations ncp_file_operations = .fsync = ncp_fsync, }; -struct inode_operations ncp_file_inode_operations = +const struct inode_operations ncp_file_inode_operations = { .setattr = ncp_notify_change, }; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 67a90bf795d..14939ddf74f 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -90,7 +90,7 @@ static int ncp_remount(struct super_block *sb, int *flags, char* data) return 0; } -static struct super_operations ncp_sops = +static const struct super_operations ncp_sops = { .alloc_inode = ncp_alloc_inode, .destroy_inode = ncp_destroy_inode, @@ -229,7 +229,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) } #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -static struct inode_operations ncp_symlink_inode_operations = { +static const struct inode_operations ncp_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7933e2e99db..75f309c8741 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -71,6 +71,8 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) complete(&nfs_callback_info.started); for(;;) { + char buf[RPC_MAX_ADDRBUFLEN]; + if (signalled()) { if (nfs_callback_info.users == 0) break; @@ -88,8 +90,8 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) __FUNCTION__, -err); break; } - dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__, - NIPQUAD(rqstp->rq_addr.sin_addr.s_addr)); + dprintk("%s: request from %s\n", __FUNCTION__, + svc_print_addr(rqstp, buf, sizeof(buf))); svc_process(rqstp); } @@ -106,7 +108,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) int nfs_callback_up(void) { struct svc_serv *serv; - struct svc_sock *svsk; int ret = 0; lock_kernel(); @@ -119,17 +120,14 @@ int nfs_callback_up(void) ret = -ENOMEM; if (!serv) goto out_err; - /* FIXME: We don't want to register this socket with the portmapper */ - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport); - if (ret < 0) + + ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); + if (ret <= 0) goto out_destroy; - if (!list_empty(&serv->sv_permsocks)) { - svsk = list_entry(serv->sv_permsocks.next, - struct svc_sock, sk_list); - nfs_callback_tcpport = ntohs(inet_sk(svsk->sk_sk)->sport); - dprintk ("Callback port = 0x%x\n", nfs_callback_tcpport); - } else - BUG(); + nfs_callback_tcpport = ret; + dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); + ret = svc_create_thread(nfs_callback_svc, serv); if (ret < 0) goto out_destroy; @@ -140,6 +138,8 @@ out: unlock_kernel(); return ret; out_destroy: + dprintk("Couldn't create callback socket or server thread; err = %d\n", + ret); svc_destroy(serv); out_err: nfs_callback_info.users--; @@ -166,15 +166,19 @@ void nfs_callback_down(void) static int nfs_callback_authenticate(struct svc_rqst *rqstp) { - struct sockaddr_in *addr = &rqstp->rq_addr; + struct sockaddr_in *addr = svc_addr_in(rqstp); struct nfs_client *clp; + char buf[RPC_MAX_ADDRBUFLEN]; /* Don't talk to strangers */ clp = nfs_find_client(addr, 4); if (clp == NULL) return SVC_DROP; - dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr->sin_addr)); + + dprintk("%s: %s NFSv4 callback!\n", __FUNCTION__, + svc_print_addr(rqstp, buf, sizeof(buf))); nfs_put_client(clp); + switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index f8ea1f51f59..849a2029975 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -176,7 +176,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) goto out; - args->addr = &rqstp->rq_addr; + args->addr = svc_addr_in(rqstp); status = decode_bitmap(xdr, args->bitmap); out: dprintk("%s: exit with status = %d\n", __FUNCTION__, status); @@ -188,7 +188,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - args->addr = &rqstp->rq_addr; + args->addr = svc_addr_in(rqstp); status = decode_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 23ab145daa2..2190e6c2792 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -394,7 +394,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, static int nfs_create_rpc_client(struct nfs_client *clp, int proto, unsigned int timeo, unsigned int retrans, - rpc_authflavor_t flavor) + rpc_authflavor_t flavor, + int flags) { struct rpc_timeout timeparms; struct rpc_clnt *clnt = NULL; @@ -407,6 +408,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, + .flags = flags, }; if (!IS_ERR(clp->cl_rpcclient)) @@ -548,7 +550,7 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data * * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans, - RPC_AUTH_UNIX); + RPC_AUTH_UNIX, 0); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -868,7 +870,8 @@ static int nfs4_init_client(struct nfs_client *clp, /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; - error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour); + error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour, + RPC_CLNT_CREATE_DISCRTRY); if (error < 0) goto error; memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); @@ -1030,7 +1033,7 @@ error: * Create an NFS4 referral server record */ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, - struct nfs_fh *fh) + struct nfs_fh *mntfh) { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; @@ -1069,8 +1072,13 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_path_walk(server, mntfh, data->mnt_path); + if (error < 0) + goto error; + /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, fh, &fattr); + error = nfs_probe_fsinfo(server, mntfh, &fattr); if (error < 0) goto error; @@ -1173,7 +1181,7 @@ static struct seq_operations nfs_server_list_ops = { .show = nfs_server_list_show, }; -static struct file_operations nfs_server_list_fops = { +static const struct file_operations nfs_server_list_fops = { .open = nfs_server_list_open, .read = seq_read, .llseek = seq_lseek, @@ -1193,7 +1201,7 @@ static struct seq_operations nfs_volume_list_ops = { .show = nfs_volume_list_show, }; -static struct file_operations nfs_volume_list_fops = { +static const struct file_operations nfs_volume_list_fops = { .open = nfs_volume_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9ba8cb0ee7..92d8ec859e2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -65,7 +65,7 @@ const struct file_operations nfs_dir_operations = { .fsync = nfs_fsync_dir, }; -struct inode_operations nfs_dir_inode_operations = { +const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, .link = nfs_link, @@ -81,7 +81,7 @@ struct inode_operations nfs_dir_inode_operations = { }; #ifdef CONFIG_NFS_V3 -struct inode_operations nfs3_dir_inode_operations = { +const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, .link = nfs_link, @@ -104,7 +104,7 @@ struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -struct inode_operations nfs4_dir_inode_operations = { +const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_atomic_lookup, .link = nfs_link, @@ -637,7 +637,7 @@ int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) * In the case it has, we assume that the dentries are untrustworthy * and may need to be looked up again. */ -static inline int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; @@ -652,6 +652,12 @@ static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) dentry->d_fsdata = (void *)verf; } +static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf) +{ + if (time_after(verf, (unsigned long)dentry->d_fsdata)) + nfs_set_verifier(dentry, verf); +} + /* * Whenever an NFS operation succeeds, we know that the dentry * is valid, so we update the revalidation timestamp. @@ -785,7 +791,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) goto out_bad; nfs_renew_times(dentry); - nfs_set_verifier(dentry, verifier); + nfs_refresh_verifier(dentry, verifier); out_valid: unlock_kernel(); dput(parent); @@ -1085,7 +1091,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) verifier = nfs_save_change_attribute(dir); ret = nfs4_open_revalidate(dir, dentry, openflags, nd); if (!ret) - nfs_set_verifier(dentry, verifier); + nfs_refresh_verifier(dentry, verifier); unlock_kernel(); out: dput(parent); @@ -1123,8 +1129,21 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) } name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); - if (dentry != NULL) - return dentry; + if (dentry != NULL) { + /* Is this a positive dentry that matches the readdir info? */ + if (dentry->d_inode != NULL && + (NFS_FILEID(dentry->d_inode) == entry->ino || + d_mountpoint(dentry))) { + if (!desc->plus || entry->fh->size == 0) + return dentry; + if (nfs_compare_fh(NFS_FH(dentry->d_inode), + entry->fh) == 0) + goto out_renew; + } + /* No, so d_drop to allow one to be created */ + d_drop(dentry); + dput(dentry); + } if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) return NULL; /* Note: caller is already holding the dir->i_mutex! */ @@ -1149,6 +1168,10 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return dentry; +out_renew: + nfs_renew_times(dentry); + nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir)); + return dentry; } /* @@ -1443,6 +1466,8 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) if (atomic_read(&dentry->d_count) > 1) { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); unlock_kernel(); return error; @@ -1684,7 +1709,7 @@ out: if (!error) { d_move(old_dentry, new_dentry); nfs_renew_times(new_dentry); - nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); + nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir)); } /* new dentry created? */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bd21d7fde65..b1c98ea39b7 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -309,7 +309,8 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo rpc_execute(&data->task); - dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n", + dprintk("NFS: %5u initiated direct read call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -639,7 +640,8 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l rpc_execute(&data->task); - dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n", + dprintk("NFS: %5u initiated direct write call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -797,7 +799,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, const char __user *buf = iov[0].iov_base; size_t count = iov[0].iov_len; - dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n", + dprintk("nfs: direct write(%s/%s, %lu@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, (unsigned long) count, (long long) pos); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9e4a2b70995..8e66b5a2d49 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -68,14 +68,14 @@ const struct file_operations nfs_file_operations = { .check_flags = nfs_check_flags, }; -struct inode_operations nfs_file_inode_operations = { +const struct inode_operations nfs_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, }; #ifdef CONFIG_NFS_V3 -struct inode_operations nfs3_file_inode_operations = { +const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 8391bd7a83c..6ef268f7c30 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -135,17 +135,15 @@ int nfs4_path_walk(struct nfs_server *server, struct nfs_fh lastfh; struct qstr name; int ret; - //int referral_count = 0; dprintk("--> nfs4_path_walk(,,%s)\n", path); fsinfo.fattr = &fattr; nfs_fattr_init(&fattr); - if (*path++ != '/') { - dprintk("nfs4_get_root: Path does not begin with a slash\n"); - return -EINVAL; - } + /* Eat leading slashes */ + while (*path == '/') + path++; /* Start by getting the root filehandle from the server */ ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); @@ -160,6 +158,7 @@ int nfs4_path_walk(struct nfs_server *server, return -ENOTDIR; } + /* FIXME: It is quite valid for the server to return a referral here */ if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { printk(KERN_ERR "nfs4_get_root:" " getroot obtained referral\n"); @@ -187,6 +186,7 @@ eat_dot_dir: goto eat_dot_dir; } + /* FIXME: Why shouldn't the user be able to use ".." in the path? */ if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) ) { printk(KERN_ERR "nfs4_get_root:" @@ -212,6 +212,7 @@ eat_dot_dir: return -ENOTDIR; } + /* FIXME: Referrals are quite valid here too */ if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { printk(KERN_ERR "nfs4_get_root:" " lookupfh obtained referral\n"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d8349828283..af53c02f473 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,13 +65,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_write_inode(struct inode *inode, int sync) { - int flags = sync ? FLUSH_SYNC : 0; int ret; - ret = nfs_commit_inode(inode, flags); - if (ret < 0) - return ret; - return 0; + if (sync) { + ret = filemap_fdatawait(inode->i_mapping); + if (ret == 0) + ret = nfs_commit_inode(inode, FLUSH_SYNC); + } else + ret = nfs_commit_inode(inode, 0); + if (ret >= 0) + return 0; + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; } void nfs_clear_inode(struct inode *inode) @@ -235,6 +240,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (inode->i_state & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; /* We set i_ino for the few things that still rely on it, * such as stat(2) */ @@ -271,7 +277,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = jiffies; + nfsi->last_updated = now; + nfsi->cache_change_attribute = now; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -290,7 +297,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_blocks = fattr->du.nfs2.blocks; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); - nfsi->attrtimeo_timestamp = jiffies; + nfsi->attrtimeo_timestamp = now; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; @@ -783,20 +790,21 @@ void nfs_end_data_update(struct inode *inode) static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_WCC) != 0) { if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } if (inode->i_size == fattr->pre_size && nfsi->npages == 0) { inode->i_size = fattr->size; - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } } } @@ -934,6 +942,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; unsigned int invalid = 0; + unsigned long now = jiffies; int data_stable; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", @@ -959,7 +968,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = jiffies; + nfsi->last_updated = now; + + /* Fix a wraparound issue with nfsi->cache_change_attribute */ + if (time_before(now, nfsi->cache_change_attribute)) + nfsi->cache_change_attribute = now - 600*HZ; /* Are we racing with known updates of the metadata on the server? */ data_stable = nfs_verify_change_attribute(inode, fattr->time_start); @@ -985,7 +998,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_size = new_isize; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } @@ -996,14 +1009,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: mtime change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); @@ -1032,18 +1045,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_sb->s_id, inode->i_ino); nfsi->change_attr = fattr->change_attr; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - nfsi->cache_change_attribute = jiffies; + nfsi->cache_change_attribute = now; } /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); - nfsi->attrtimeo_timestamp = jiffies; - } else if (time_after(jiffies, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { + nfsi->attrtimeo_timestamp = now; + } else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); - nfsi->attrtimeo_timestamp = jiffies; + nfsi->attrtimeo_timestamp = now; } /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) @@ -1122,7 +1135,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; - nfsi->cache_change_attribute = jiffies; #ifdef CONFIG_NFS_V3_ACL nfsi->acl_access = ERR_PTR(-EAGAIN); nfsi->acl_default = ERR_PTR(-EAGAIN); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a28f6ce2e13..6610f2b0207 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -107,10 +107,6 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 extern struct rpc_procinfo nfs4_procedures[]; - -extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, - struct nfs4_fs_locations *fs_locations, - struct page *page); #endif /* dir.c */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 371b804e7cc..7f86e65182e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -155,12 +155,12 @@ out_follow: goto out; } -struct inode_operations nfs_mountpoint_inode_operations = { +const struct inode_operations nfs_mountpoint_inode_operations = { .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, }; -struct inode_operations nfs_referral_inode_operations = { +const struct inode_operations nfs_referral_inode_operations = { .follow_link = nfs_follow_mountpoint, }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index acd8fe9762d..7d0371e2bad 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -253,29 +253,6 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, return status; } -static int nfs3_proc_read(struct nfs_read_data *rdata) -{ - int flags = rdata->flags; - struct inode * inode = rdata->inode; - struct nfs_fattr * fattr = rdata->res.fattr; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_READ], - .rpc_argp = &rdata->args, - .rpc_resp = &rdata->res, - .rpc_cred = rdata->cred, - }; - int status; - - dprintk("NFS call read %d @ %Ld\n", rdata->args.count, - (long long) rdata->args.offset); - nfs_fattr_init(fattr); - status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); - if (status >= 0) - nfs_refresh_inode(inode, fattr); - dprintk("NFS reply read: %d\n", status); - return status; -} - /* * Create a regular file. * For now, we don't implement O_EXCL. @@ -855,7 +832,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .lookup = nfs3_proc_lookup, .access = nfs3_proc_access, .readlink = nfs3_proc_readlink, - .read = nfs3_proc_read, .create = nfs3_proc_create, .remove = nfs3_proc_remove, .unlink_setup = nfs3_proc_unlink_setup, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c26cd978c7c..cf3a17eb5c0 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -151,7 +151,7 @@ struct nfs4_state_recovery_ops { }; extern struct dentry_operations nfs4_dentry_operations; -extern struct inode_operations nfs4_dir_inode_operations; +extern const struct inode_operations nfs4_dir_inode_operations; /* inode.c */ extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); @@ -169,7 +169,7 @@ extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); -extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, +extern int nfs4_proc_fs_locations(struct inode *dir, struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b872779d7cd..dd5fef20c70 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -16,6 +16,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include "internal.h" +#include "nfs4_fs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -130,7 +131,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - char *devname; int loc, s, error; if (locations == NULL || locations->nlocations <= 0) @@ -154,12 +154,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); - if (IS_ERR(devname)) { - mnt = (struct vfsmount *)devname; - goto out; - } - loc = 0; while (loc < locations->nlocations && IS_ERR(mnt)) { const struct nfs4_fs_location *location = &locations->locations[loc]; @@ -194,7 +188,11 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, addr.sin_port = htons(NFS_PORT); mountdata.addr = &addr; - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, devname, &mountdata); + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata.hostname, + mountdata.mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); if (!IS_ERR(mnt)) { break; } @@ -242,7 +240,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name); - err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page); + err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); dput(parent); if (err != 0 || fs_locations->nlocations <= 0 || diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b3fd29baadc..f52cf5c33c6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1140,7 +1140,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs_client); break; default: if (nfs4_async_handle_error(task, server) == -EAGAIN) { @@ -1424,7 +1423,6 @@ static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fa int status = -ENOMEM; struct page *page = NULL; struct nfs4_fs_locations *locations = NULL; - struct dentry dentry = {}; page = alloc_page(GFP_KERNEL); if (page == NULL) @@ -1433,9 +1431,7 @@ static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fa if (locations == NULL) goto out; - dentry.d_name.name = name->name; - dentry.d_name.len = name->len; - status = nfs4_proc_fs_locations(dir, &dentry, locations, page); + status = nfs4_proc_fs_locations(dir, name, locations, page); if (status != 0) goto out; /* Make sure server returned a different fsid for the referral */ @@ -1737,44 +1733,6 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, return err; } -static int _nfs4_proc_read(struct nfs_read_data *rdata) -{ - int flags = rdata->flags; - struct inode *inode = rdata->inode; - struct nfs_fattr *fattr = rdata->res.fattr; - struct nfs_server *server = NFS_SERVER(inode); - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ], - .rpc_argp = &rdata->args, - .rpc_resp = &rdata->res, - .rpc_cred = rdata->cred, - }; - unsigned long timestamp = jiffies; - int status; - - dprintk("NFS call read %d @ %Ld\n", rdata->args.count, - (long long) rdata->args.offset); - - nfs_fattr_init(fattr); - status = rpc_call_sync(server->client, &msg, flags); - if (!status) - renew_lease(server, timestamp); - dprintk("NFS reply read: %d\n", status); - return status; -} - -static int nfs4_proc_read(struct nfs_read_data *rdata) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(rdata->inode), - _nfs4_proc_read(rdata), - &exception); - } while (exception.retry); - return err; -} - /* * Got race? * We will need to arrange for the VFS layer to provide an atomic open. @@ -2753,11 +2711,15 @@ static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp) might_sleep(); + rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_); + rpc_clnt_sigmask(clnt, &oldset); res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER, nfs4_wait_bit_interruptible, TASK_INTERRUPTIBLE); rpc_clnt_sigunmask(clnt, &oldset); + + rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_); return res; } @@ -2996,7 +2958,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs_client); case 0: return 0; } @@ -3150,12 +3111,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(calldata->server->nfs_client); break; default: - if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { + if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) rpc_restart_call(task); - } } } @@ -3585,7 +3544,7 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } -int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, +int nfs4_proc_fs_locations(struct inode *dir, struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); @@ -3595,7 +3554,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, }; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), - .name = &dentry->d_name, + .name = name, .page = page, .bitmask = bitmask, }; @@ -3607,7 +3566,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, int status; dprintk("%s: start\n", __FUNCTION__); - fs_locations->fattr.valid = 0; + nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); @@ -3625,7 +3584,7 @@ struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { .recover_lock = nfs4_lock_expired, }; -static struct inode_operations nfs4_file_inode_operations = { +static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, @@ -3646,7 +3605,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, - .read = nfs4_proc_read, .create = nfs4_proc_create, .remove = nfs4_proc_remove, .unlink_setup = nfs4_proc_unlink_setup, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 823298561c0..f5f4430fb2a 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -43,7 +43,6 @@ * child task framework of the RPC layer? */ -#include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/mm.h> #include <linux/pagemap.h> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0cf3fa312a3..f02d522fd78 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -387,8 +387,10 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ op_decode_hdr_maxsz + 12) #define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 560536ad74a..1dcf56de948 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -186,35 +186,6 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page, return status; } -static int nfs_proc_read(struct nfs_read_data *rdata) -{ - int flags = rdata->flags; - struct inode * inode = rdata->inode; - struct nfs_fattr * fattr = rdata->res.fattr; - struct rpc_message msg = { - .rpc_proc = &nfs_procedures[NFSPROC_READ], - .rpc_argp = &rdata->args, - .rpc_resp = &rdata->res, - .rpc_cred = rdata->cred, - }; - int status; - - dprintk("NFS call read %d @ %Ld\n", rdata->args.count, - (long long) rdata->args.offset); - nfs_fattr_init(fattr); - status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); - if (status >= 0) { - nfs_refresh_inode(inode, fattr); - /* Emulate the eof flag, which isn't normally needed in NFSv2 - * as it is guaranteed to always return the file attributes - */ - if (rdata->args.offset + rdata->args.count >= fattr->size) - rdata->res.eof = 1; - } - dprintk("NFS reply read: %d\n", status); - return status; -} - static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) @@ -666,7 +637,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lookup = nfs_proc_lookup, .access = NULL, /* access */ .readlink = nfs_proc_readlink, - .read = nfs_proc_read, .create = nfs_proc_create, .remove = nfs_proc_remove, .unlink_setup = nfs_proc_unlink_setup, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a9c26521a9e..6ab4d5a9edf 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -5,14 +5,6 @@ * * Partial copy of Linus' read cache modifications to fs/nfs/file.c * modified for async RPC by okir@monad.swb.de - * - * We do an ugly hack here in order to return proper error codes to the - * user program when a read request failed: since generic_file_read - * only checks the return value of inode->i_op->readpage() which is always 0 - * for async RPC, we set the error bit of the page to 1 when an error occurs, - * and make nfs_readpage transmit requests synchronously when encountering this. - * This is only a small problem, though, since we now retry all operations - * within the RPC code when root squashing is suspected. */ #include <linux/time.h> @@ -122,93 +114,6 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -/* - * Read a page synchronously. - */ -static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) -{ - unsigned int rsize = NFS_SERVER(inode)->rsize; - unsigned int count = PAGE_CACHE_SIZE; - int result = -ENOMEM; - struct nfs_read_data *rdata; - - rdata = nfs_readdata_alloc(count); - if (!rdata) - goto out_unlock; - - memset(rdata, 0, sizeof(*rdata)); - rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); - rdata->cred = ctx->cred; - rdata->inode = inode; - INIT_LIST_HEAD(&rdata->pages); - rdata->args.fh = NFS_FH(inode); - rdata->args.context = ctx; - rdata->args.pages = &page; - rdata->args.pgbase = 0UL; - rdata->args.count = rsize; - rdata->res.fattr = &rdata->fattr; - - dprintk("NFS: nfs_readpage_sync(%p)\n", page); - - /* - * This works now because the socket layer never tries to DMA - * into this buffer directly. - */ - do { - if (count < rsize) - rdata->args.count = count; - rdata->res.count = rdata->args.count; - rdata->args.offset = page_offset(page) + rdata->args.pgbase; - - dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - (unsigned long long)rdata->args.pgbase, - rdata->args.count); - - lock_kernel(); - result = NFS_PROTO(inode)->read(rdata); - unlock_kernel(); - - /* - * Even if we had a partial success we can't mark the page - * cache valid. - */ - if (result < 0) { - if (result == -EISDIR) - result = -EINVAL; - goto io_error; - } - count -= result; - rdata->args.pgbase += result; - nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result); - - /* Note: result == 0 should only happen if we're caching - * a write that extends the file and punches a hole. - */ - if (rdata->res.eof != 0 || result == 0) - break; - } while (count); - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; - spin_unlock(&inode->i_lock); - - if (rdata->res.eof || rdata->res.count == rdata->args.count) { - SetPageUptodate(page); - if (rdata->res.eof && count != 0) - memclear_highpage_flush(page, rdata->args.pgbase, count); - } - result = 0; - -io_error: - nfs_readdata_free(rdata); -out_unlock: - unlock_page(page); - return result; -} - static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -278,7 +183,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->task.tk_cookie = (unsigned long)inode; - dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", + dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -452,7 +357,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) { int status; - dprintk("%s: %4d, (status %d)\n", __FUNCTION__, task->tk_pid, + dprintk("NFS: %s: %5u, (status %d)\n", __FUNCTION__, task->tk_pid, task->tk_status); status = NFS_PROTO(data->inode)->read_done(task, data); @@ -621,15 +526,9 @@ int nfs_readpage(struct file *file, struct page *page) } else ctx = get_nfs_open_context((struct nfs_open_context *) file->private_data); - if (!IS_SYNC(inode)) { - error = nfs_readpage_async(ctx, inode, page); - goto out; - } - error = nfs_readpage_sync(ctx, inode, page); - if (error < 0 && IS_SWAPFILE(inode)) - printk("Aiee.. nfs swap-in of page failed!\n"); -out: + error = nfs_readpage_async(ctx, inode, page); + put_nfs_open_context(ctx); return error; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 28108c82b88..bb516a2cfba 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -44,6 +44,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> +#include <linux/magic.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -81,7 +82,7 @@ struct file_system_type nfs_xdev_fs_type = { .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -static struct super_operations nfs_sops = { +static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, @@ -125,7 +126,7 @@ struct file_system_type nfs4_referral_fs_type = { .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -static struct super_operations nfs4_sops = { +static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, @@ -1044,7 +1045,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, data->fh); + mntroot = nfs4_get_root(s, &mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 525c136c7d8..f4a0548b9ce 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -78,7 +78,7 @@ read_failed: /* * symlinks can't do much... */ -struct inode_operations nfs_symlink_inode_operations = { +const struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = nfs_follow_link, .put_link = page_put_link, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 3ea50ac6482..fcdcafbb329 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -75,7 +75,7 @@ static ctl_table nfs_cb_sysctl_root[] = { int nfs_register_sysctl(void) { - nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root, 0); + nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root); if (nfs_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 345492e7864..febdade9167 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1,47 +1,7 @@ /* * linux/fs/nfs/write.c * - * Writing file data over NFS. - * - * We do it like this: When a (user) process wishes to write data to an - * NFS file, a write request is allocated that contains the RPC task data - * plus some info on the page to be written, and added to the inode's - * write chain. If the process writes past the end of the page, an async - * RPC call to write the page is scheduled immediately; otherwise, the call - * is delayed for a few seconds. - * - * Just like readahead, no async I/O is performed if wsize < PAGE_SIZE. - * - * Write requests are kept on the inode's writeback list. Each entry in - * that list references the page (portion) to be written. When the - * cache timeout has expired, the RPC task is woken up, and tries to - * lock the page. As soon as it manages to do so, the request is moved - * from the writeback list to the writelock list. - * - * Note: we must make sure never to confuse the inode passed in the - * write_page request with the one in page->inode. As far as I understand - * it, these are different when doing a swap-out. - * - * To understand everything that goes on here and in the NFS read code, - * one should be aware that a page is locked in exactly one of the following - * cases: - * - * - A write request is in progress. - * - A user process is in generic_file_write/nfs_update_page - * - A user process is in generic_file_read - * - * Also note that because of the way pages are invalidated in - * nfs_revalidate_inode, the following assertions hold: - * - * - If a page is dirty, there will be no read requests (a page will - * not be re-read unless invalidated by nfs_revalidate_inode). - * - If the page is not uptodate, there will be no pending write - * requests, and no process will be in nfs_update_page. - * - * FIXME: Interaction with the vmscan routines is not optimal yet. - * Either vmscan must be made nfs-savvy, or we need a different page - * reclaim concept that supports something like FS-independent - * buffer_heads with a b_ops-> field. + * Write file data over NFS. * * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de> */ @@ -79,7 +39,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*, unsigned int, unsigned int); static void nfs_mark_request_dirty(struct nfs_page *req); static int nfs_wait_on_write_congestion(struct address_space *, int); -static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); static const struct rpc_call_ops nfs_write_partial_ops; static const struct rpc_call_ops nfs_write_full_ops; @@ -194,6 +153,13 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c i_size_write(inode, end); } +/* A writeback failed: mark the page as bad, and invalidate the page cache */ +static void nfs_set_pageerror(struct page *page) +{ + SetPageError(page); + nfs_zap_mapping(page->mapping->host, page->mapping); +} + /* We can set the PG_uptodate flag if we see that a write request * covers the full page. */ @@ -323,7 +289,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc err = 0; out: if (!wbc->for_writepages) - nfs_flush_mapping(page->mapping, wbc, wb_priority(wbc)); + nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); return err; } @@ -360,14 +326,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) if (err < 0) goto out; nfs_add_stats(inode, NFSIOS_WRITEPAGES, err); - if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) { - err = nfs_wait_on_requests(inode, 0, 0); - if (err < 0) - goto out; - } - err = nfs_commit_inode(inode, wb_priority(wbc)); - if (err > 0) - err = 0; + err = 0; out: clear_bit(BDI_write_congested, &bdi->state); wake_up_all(&nfs_write_congestion); @@ -516,17 +475,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st return res; } -static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int npages) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int ret; - - spin_lock(&nfsi->req_lock); - ret = nfs_wait_on_requests_locked(inode, idx_start, npages); - spin_unlock(&nfsi->req_lock); - return ret; -} - static void nfs_cancel_dirty_list(struct list_head *head) { struct nfs_page *req; @@ -773,7 +721,7 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); if (status < 0) - ClearPageUptodate(page); + nfs_set_pageerror(page); return status; } @@ -852,7 +800,8 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + dprintk("NFS: %5u initiated write call " + "(req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -1034,8 +983,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) return; if (task->tk_status < 0) { - ClearPageUptodate(page); - SetPageError(page); + nfs_set_pageerror(page); req->wb_context->error = task->tk_status; dprintk(", error = %d\n", task->tk_status); } else { @@ -1092,8 +1040,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) (long long)req_offset(req)); if (task->tk_status < 0) { - ClearPageUptodate(page); - SetPageError(page); + nfs_set_pageerror(page); req->wb_context->error = task->tk_status; end_page_writeback(page); nfs_inode_remove_request(req); @@ -1134,7 +1081,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) struct nfs_writeres *resp = &data->res; int status; - dprintk("NFS: %4d nfs_writeback_done (status %d)\n", + dprintk("NFS: %5u nfs_writeback_done (status %d)\n", task->tk_pid, task->tk_status); /* @@ -1250,7 +1197,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); } /* @@ -1291,7 +1238,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) struct nfs_write_data *data = calldata; struct nfs_page *req; - dprintk("NFS: %4d nfs_commit_done (status %d)\n", + dprintk("NFS: %5u nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); /* Call the NFS version-specific code */ @@ -1516,6 +1463,8 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) if (ret < 0) goto out; } + if (!PagePrivate(page)) + return 0; ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); if (ret >= 0) return 0; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 49c310b8492..6f24768272a 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -16,7 +16,6 @@ #include <linux/unistd.h> #include <linux/slab.h> -#include <linux/sched.h> #include <linux/stat.h> #include <linux/in.h> #include <linux/seq_file.h> @@ -190,18 +189,17 @@ static int expkey_show(struct seq_file *m, struct cache_head *h) { struct svc_expkey *ek ; + int i; if (h ==NULL) { seq_puts(m, "#domain fsidtype fsid [path]\n"); return 0; } ek = container_of(h, struct svc_expkey, h); - seq_printf(m, "%s %d 0x%08x", ek->ek_client->name, - ek->ek_fsidtype, ek->ek_fsid[0]); - if (ek->ek_fsidtype != 1) - seq_printf(m, "%08x", ek->ek_fsid[1]); - if (ek->ek_fsidtype == 2) - seq_printf(m, "%08x", ek->ek_fsid[2]); + seq_printf(m, "%s %d 0x", ek->ek_client->name, + ek->ek_fsidtype); + for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) + seq_printf(m, "%08x", ek->ek_fsid[i]); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); @@ -232,9 +230,8 @@ static inline void expkey_init(struct cache_head *cnew, kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; - new->ek_fsid[0] = item->ek_fsid[0]; - new->ek_fsid[1] = item->ek_fsid[1]; - new->ek_fsid[2] = item->ek_fsid[2]; + + memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); } static inline void expkey_update(struct cache_head *cnew, @@ -363,7 +360,7 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int flags) +static int check_export(struct inode *inode, int flags, unsigned char *uuid) { /* We currently export only dirs and regular files. @@ -376,12 +373,13 @@ static int check_export(struct inode *inode, int flags) /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) - * or an FSID number (so NFSEXP_FSID needed). + * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && - !(flags & NFSEXP_FSID)) { + !(flags & NFSEXP_FSID) && + uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } @@ -406,10 +404,6 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int len; int migrated, i, err; - len = qword_get(mesg, buf, PAGE_SIZE); - if (len != 5 || memcmp(buf, "fsloc", 5)) - return 0; - /* listsize */ err = get_int(mesg, &fsloc->locations_count); if (err) @@ -520,6 +514,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_fslocs.locations_count = 0; exp.ex_fslocs.migrated = 0; + exp.ex_uuid = NULL; + /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) @@ -543,12 +539,33 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err) goto out; exp.ex_fsid = an_int; - err = check_export(nd.dentry->d_inode, exp.ex_flags); - if (err) goto out; + while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + if (strcmp(buf, "fsloc") == 0) + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + else if (strcmp(buf, "uuid") == 0) { + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len != 16) + err = -EINVAL; + else { + exp.ex_uuid = + kmemdup(buf, 16, GFP_KERNEL); + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } + } else + /* quietly ignore unknown words and anything + * following. Newer user-space can try to set + * new values, then see what the result was. + */ + break; + if (err) + goto out; + } - err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); - if (err) - goto out; + err = check_export(nd.dentry->d_inode, exp.ex_flags, + exp.ex_uuid); + if (err) goto out; } expp = svc_export_lookup(&exp); @@ -562,6 +579,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) else exp_put(expp); out: + nfsd4_fslocs_free(&exp.ex_fslocs); + kfree(exp.ex_uuid); kfree(exp.ex_path); if (nd.dentry) path_release(&nd); @@ -591,9 +610,19 @@ static int svc_export_show(struct seq_file *m, seq_escape(m, exp->ex_client->name, " \t\n\\"); seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && - !test_bit(CACHE_NEGATIVE, &h->flags)) + !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); + if (exp->ex_uuid) { + int i; + seq_puts(m, ",uuid="); + for (i=0; i<16; i++) { + if ((i&3) == 0 && i) + seq_putc(m, ':'); + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } + } seq_puts(m, ")\n"); return 0; } @@ -630,6 +659,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; new->ex_path = item->ex_path; item->ex_path = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; @@ -752,11 +783,11 @@ exp_get_key(svc_client *clp, dev_t dev, ino_t ino) u32 fsidv[3]; if (old_valid_dev(dev)) { - mk_fsid_v0(fsidv, dev, ino); - return exp_find_key(clp, 0, fsidv, NULL); + mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_DEV, fsidv, NULL); } - mk_fsid_v3(fsidv, dev, ino); - return exp_find_key(clp, 3, fsidv, NULL); + mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL); + return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL); } /* @@ -767,9 +798,9 @@ exp_get_fsid_key(svc_client *clp, int fsid) { u32 fsidv[2]; - mk_fsid_v1(fsidv, fsid); + mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL); - return exp_find_key(clp, 1, fsidv, NULL); + return exp_find_key(clp, FSID_NUM, fsidv, NULL); } svc_export * @@ -883,8 +914,8 @@ static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) if ((exp->ex_flags & NFSEXP_FSID) == 0) return 0; - mk_fsid_v1(fsid, exp->ex_fsid); - return exp_set_key(clp, 1, fsid, exp); + mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL); + return exp_set_key(clp, FSID_NUM, fsid, exp); } static int exp_hash(struct auth_domain *clp, struct svc_export *exp) @@ -894,11 +925,11 @@ static int exp_hash(struct auth_domain *clp, struct svc_export *exp) dev_t dev = inode->i_sb->s_dev; if (old_valid_dev(dev)) { - mk_fsid_v0(fsid, dev, inode->i_ino); - return exp_set_key(clp, 0, fsid, exp); + mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_DEV, fsid, exp); } - mk_fsid_v3(fsid, dev, inode->i_ino); - return exp_set_key(clp, 3, fsid, exp); + mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL); + return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp); } static void exp_unhash(struct svc_export *exp) @@ -977,7 +1008,7 @@ exp_export(struct nfsctl_export *nxp) goto finish; } - err = check_export(nd.dentry->d_inode, nxp->ex_flags); + err = check_export(nd.dentry->d_inode, nxp->ex_flags, NULL); if (err) goto finish; err = -ENOMEM; @@ -1170,9 +1201,9 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, __be32 rv; u32 fsidv[2]; - mk_fsid_v1(fsidv, 0); + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - exp = exp_find(clp, 1, fsidv, creq); + exp = exp_find(clp, FSID_NUM, fsidv, creq); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); if (exp == NULL) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index edde5dc5f79..b6174288501 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -287,13 +287,20 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p, return 1; } -static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p, - struct nfsd_fhandle *resp) +static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p, + struct nfsd_attrstat *resp) { fh_put(&resp->fh); return 1; } +static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, + struct nfsd3_accessres *resp) +{ + fh_put(&resp->fh); + return 1; +} + #define nfsaclsvc_decode_voidargs NULL #define nfsaclsvc_encode_voidres NULL #define nfsaclsvc_release_void NULL @@ -322,9 +329,9 @@ struct nfsd3_voidargs { int dummy; }; static struct svc_procedure nfsd_acl_procedures2[] = { PROC(null, void, void, void, RC_NOCACHE, ST), PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, fhandle, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), - PROC(access, access, access, fhandle, RC_NOCACHE, ST+AT+1), + PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), + PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), }; struct svc_version nfsd_acl_version2 = { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e695660921e..6f677988c71 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -149,6 +149,27 @@ decode_sattr3(__be32 *p, struct iattr *iap) return p; } +static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +{ + u64 f; + switch(fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + p = xdr_encode_hyper(p, (u64)huge_encode_dev + (fhp->fh_dentry->d_inode->i_sb->s_dev)); + break; + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u64*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; + p = xdr_encode_hyper(p, f); + break; + } + return p; +} + static __be32 * encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) @@ -169,10 +190,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); *p++ = htonl((u32) MAJOR(stat->rdev)); *p++ = htonl((u32) MINOR(stat->rdev)); - if (is_fsid(fhp, rqstp->rq_reffh)) - p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); - else - p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev)); + p = encode_fsid(p, fhp); p = xdr_encode_hyper(p, (u64) stat->ino); p = encode_time3(p, &stat->atime); lease_get_mtime(dentry->d_inode, &time); @@ -203,10 +221,7 @@ encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) p = xdr_encode_hyper(p, ((u64)fhp->fh_post_blocks) << 9); *p++ = fhp->fh_post_rdev[0]; *p++ = fhp->fh_post_rdev[1]; - if (is_fsid(fhp, rqstp->rq_reffh)) - p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); - else - p = xdr_encode_hyper(p, (u64)huge_encode_dev(inode->i_sb->s_dev)); + p = encode_fsid(p, fhp); p = xdr_encode_hyper(p, (u64) inode->i_ino); p = encode_time3(p, &fhp->fh_post_atime); p = encode_time3(p, &fhp->fh_post_mtime); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 5d94555cdc8..832673b1458 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -61,9 +61,11 @@ /* flags used to simulate posix default ACLs */ #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ - | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE) + | NFS4_ACE_DIRECTORY_INHERIT_ACE) -#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP) +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) #define MASK_EQUAL(mask1, mask2) \ ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) @@ -87,12 +89,19 @@ mask_from_posix(unsigned short perm, unsigned int flags) } static u32 -deny_mask(u32 allow_mask, unsigned int flags) +deny_mask_from_posix(unsigned short perm, u32 flags) { - u32 ret = ~allow_mask & ~NFS4_MASK_UNSUPP; - if (!(flags & NFS4_ACL_DIR)) - ret &= ~NFS4_ACE_DELETE_CHILD; - return ret; + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; } /* XXX: modify functions to return NFS errors; they're only ever @@ -126,108 +135,151 @@ struct ace_container { }; static short ace2type(struct nfs4_ace *); -static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int); -int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); -static int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); +void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); struct nfs4_acl * nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, unsigned int flags) { struct nfs4_acl *acl; - int error = -EINVAL; + int size = 0; - if ((pacl != NULL && - (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) || - (dpacl != NULL && - (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0))) - goto out_err; - - acl = nfs4_acl_new(); - if (acl == NULL) { - error = -ENOMEM; - goto out_err; + if (pacl) { + if (posix_acl_valid(pacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*pacl->a_count; } - - if (pacl != NULL) { - error = _posix_to_nfsv4_one(pacl, acl, - flags & ~NFS4_ACL_TYPE_DEFAULT); - if (error < 0) - goto out_acl; + if (dpacl) { + if (posix_acl_valid(dpacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*dpacl->a_count; } - if (dpacl != NULL) { - error = _posix_to_nfsv4_one(dpacl, acl, - flags | NFS4_ACL_TYPE_DEFAULT); - if (error < 0) - goto out_acl; - } + /* Allocate for worst case: one (deny, allow) pair each: */ + acl = nfs4_acl_new(size); + if (acl == NULL) + return ERR_PTR(-ENOMEM); - return acl; + if (pacl) + _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); -out_acl: - nfs4_acl_free(acl); -out_err: - acl = ERR_PTR(error); + if (dpacl) + _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); return acl; } -static int -nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype, - uid_t owner, unsigned int flags) +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) { - int error; - - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - eflag, mask, whotype, owner); - if (error < 0) - return error; - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - eflag, deny_mask(mask, flags), whotype, owner); - return error; + struct posix_acl_entry *pa, *pe; + pas->users = 0; + pas->groups = 0; + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; } /* We assume the acl has been verified with posix_acl_valid. */ -static int +static void _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, unsigned int flags) { - struct posix_acl_entry *pa, *pe, *group_owner_entry; - int error = -EINVAL; - u32 mask, mask_mask; + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? NFS4_INHERITANCE_FLAGS : 0); BUG_ON(pacl->a_count < 3); - pe = pacl->a_entries + pacl->a_count; - pa = pe - 2; /* if mask entry exists, it's second from the last. */ - if (pa->e_tag == ACL_MASK) - mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags); - else - mask_mask = 0; + summarize_posix_acl(pacl, &pas); pa = pacl->a_entries; - BUG_ON(pa->e_tag != ACL_USER_OBJ); - mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); - error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags); - if (error < 0) - goto out; - pa++; + ace = acl->aces + acl->naces; - while (pa->e_tag == ACL_USER) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - eflag, mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; - error = nfs4_acl_add_pair(acl, eflag, mask, - NFS4_ACL_WHO_NAMED, pa->e_id, flags); - if (error < 0) - goto out; + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; pa++; } @@ -236,67 +288,65 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, /* allow ACEs */ - if (pacl->a_count > 3) { - BUG_ON(pa->e_tag != ACL_GROUP_OBJ); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, - NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; - } group_owner_entry = pa; - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, - NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; pa++; while (pa->e_tag == ACL_GROUP) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, - NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; - - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, - NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; pa++; } /* deny ACEs */ pa = group_owner_entry; - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, - deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } pa++; + while (pa->e_tag == ACL_GROUP) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, - deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } pa++; } if (pa->e_tag == ACL_MASK) pa++; - BUG_ON(pa->e_tag != ACL_OTHER); - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags); - -out: - return error; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; } static void @@ -342,46 +392,6 @@ sort_pacl(struct posix_acl *pacl) return; } -int -nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) -{ - struct nfs4_acl *dacl; - int error = -ENOMEM; - - *pacl = NULL; - *dpacl = NULL; - - dacl = nfs4_acl_new(); - if (dacl == NULL) - goto out; - - error = nfs4_acl_split(acl, dacl); - if (error) - goto out_acl; - - *pacl = _nfsv4_to_posix_one(acl, flags); - if (IS_ERR(*pacl)) { - error = PTR_ERR(*pacl); - *pacl = NULL; - goto out_acl; - } - - *dpacl = _nfsv4_to_posix_one(dacl, flags); - if (IS_ERR(*dpacl)) { - error = PTR_ERR(*dpacl); - *dpacl = NULL; - } -out_acl: - if (error) { - posix_acl_release(*pacl); - *pacl = NULL; - } - nfs4_acl_free(dacl); -out: - return error; -} - /* * While processing the NFSv4 ACE, this maintains bitmasks representing * which permission bits have been allowed and which denied to a given @@ -406,6 +416,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { + int empty; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -421,6 +432,7 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't no which, so we allocate @@ -488,6 +500,20 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) int nace; int i, error = 0; + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, we + * set a zero-length default posix acl: + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { + pacl = posix_acl_alloc(0, GFP_KERNEL); + return pacl ? pacl : ERR_PTR(-ENOMEM); + } + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ nace = 4 + state->users->n + state->groups->n; pacl = posix_acl_alloc(nace, GFP_KERNEL); if (!pacl) @@ -603,6 +629,8 @@ static void process_one_v4_ace(struct posix_acl_state *state, u32 mask = ace->access_mask; int i; + state->empty = 0; + switch (ace2type(ace)) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { @@ -666,75 +694,62 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -static struct posix_acl * -_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, + struct posix_acl **dpacl, unsigned int flags) { - struct posix_acl_state state; - struct posix_acl *pacl; + struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; int ret; - ret = init_state(&state, n4acl->naces); + ret = init_state(&effective_acl_state, acl->naces); if (ret) - return ERR_PTR(ret); - - list_for_each_entry(ace, &n4acl->ace_head, l_ace) - process_one_v4_ace(&state, ace); - - pacl = posix_state_to_acl(&state, flags); - - free_state(&state); - - if (!IS_ERR(pacl)) - sort_pacl(pacl); - return pacl; -} - -static int -nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) -{ - struct list_head *h, *n; - struct nfs4_ace *ace; - int error = 0; - - list_for_each_safe(h, n, &acl->ace_head) { - ace = list_entry(h, struct nfs4_ace, l_ace); - + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) - return -EINVAL; - + goto out_dstate; if (ace->flag & ~NFS4_SUPPORTED_FLAGS) - return -EINVAL; - - switch (ace->flag & NFS4_INHERITANCE_FLAGS) { - case 0: - /* Leave this ace in the effective acl: */ + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); continue; - case NFS4_INHERITANCE_FLAGS: - /* Add this ace to the default acl and remove it - * from the effective acl: */ - error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who); - if (error) - return error; - list_del(h); - kfree(ace); - acl->naces--; - break; - case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE: - /* Add this ace to the default, but leave it in - * the effective acl as well: */ - error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who); - if (error) - return error; - break; - default: - return -EINVAL; } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. + */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); } - return 0; + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + posix_acl_release(*pacl); + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; } static short @@ -759,48 +774,22 @@ EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); struct nfs4_acl * -nfs4_acl_new(void) +nfs4_acl_new(int n) { struct nfs4_acl *acl; - if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL) + acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); + if (acl == NULL) return NULL; - acl->naces = 0; - INIT_LIST_HEAD(&acl->ace_head); - return acl; } void -nfs4_acl_free(struct nfs4_acl *acl) -{ - struct list_head *h; - struct nfs4_ace *ace; - - if (!acl) - return; - - while (!list_empty(&acl->ace_head)) { - h = acl->ace_head.next; - list_del(h); - ace = list_entry(h, struct nfs4_ace, l_ace); - kfree(ace); - } - - kfree(acl); - - return; -} - -int nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, int whotype, uid_t who) { - struct nfs4_ace *ace; - - if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -ENOMEM; + struct nfs4_ace *ace = acl->aces + acl->naces; ace->type = type; ace->flag = flag; @@ -808,10 +797,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, ace->whotype = whotype; ace->who = who; - list_add_tail(&ace->l_ace, &acl->ace_head); acl->naces++; - - return 0; } static struct { @@ -865,7 +851,6 @@ nfs4_acl_write_who(int who, char *p) } EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_free); EXPORT_SYMBOL(nfs4_acl_add_ace); EXPORT_SYMBOL(nfs4_acl_get_whotype); EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f57655a7a2b..fb14d68eaca 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -387,7 +387,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) .address = (struct sockaddr *)&addr, .addrsize = sizeof(addr), .timeout = &timeparms, - .servername = clp->cl_name.data, .program = program, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ @@ -397,6 +396,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, }; + char clientname[16]; int status; if (atomic_read(&cb->cb_set)) @@ -419,6 +419,11 @@ nfsd4_probe_callback(struct nfs4_client *clp) memset(program->stats, 0, sizeof(cb->cb_stat)); program->stats->program = program; + /* Just here to make some printk's more useful: */ + snprintf(clientname, sizeof(clientname), + "%u.%u.%u.%u", NIPQUAD(addr.sin_addr)); + args.servername = clientname; + /* Create RPC client */ cb->cb_client = rpc_create(&args); if (IS_ERR(cb->cb_client)) { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index b1902ebaab4..e4a83d727af 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -50,7 +50,6 @@ #include <linux/sunrpc/cache.h> #include <linux/nfsd_idmap.h> #include <linux/list.h> -#include <linux/sched.h> #include <linux/time.h> #include <linux/seq_file.h> #include <linux/sunrpc/svcauth.h> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9de89df961f..9e406799920 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -714,7 +714,7 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; + struct sockaddr_in *sin = svc_addr_in(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -749,7 +749,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ status = nfserr_clid_inuse; if (!cmp_creds(&conf->cl_cred, &rqstp->rq_cred) - || conf->cl_addr != ip_addr) { + || conf->cl_addr != sin->sin_addr.s_addr) { printk("NFSD: setclientid: string in use by client" "(clientid %08x/%08x)\n", conf->cl_clientid.cl_boot, conf->cl_clientid.cl_id); @@ -769,7 +769,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; copy_verf(new, &clverifier); - new->cl_addr = ip_addr; + new->cl_addr = sin->sin_addr.s_addr; copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); gen_confirm(new); @@ -801,7 +801,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; copy_verf(new,&conf->cl_verifier); - new->cl_addr = ip_addr; + new->cl_addr = sin->sin_addr.s_addr; copy_cred(&new->cl_cred,&rqstp->rq_cred); copy_clid(new, conf); gen_confirm(new); @@ -820,7 +820,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; copy_verf(new,&clverifier); - new->cl_addr = ip_addr; + new->cl_addr = sin->sin_addr.s_addr; copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); gen_confirm(new); @@ -847,7 +847,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; copy_verf(new,&clverifier); - new->cl_addr = ip_addr; + new->cl_addr = sin->sin_addr.s_addr; copy_cred(&new->cl_cred,&rqstp->rq_cred); gen_clid(new); gen_confirm(new); @@ -881,7 +881,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid_confirm *setclientid_confirm) { - __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; + struct sockaddr_in *sin = svc_addr_in(rqstp); struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; @@ -900,9 +900,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unconf = find_unconfirmed_client(clid); status = nfserr_clid_inuse; - if (conf && conf->cl_addr != ip_addr) + if (conf && conf->cl_addr != sin->sin_addr.s_addr) goto out; - if (unconf && unconf->cl_addr != ip_addr) + if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) goto out; if ((conf && unconf) && diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 18aa9440df1..5d090f11f2b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -199,24 +199,22 @@ defer_free(struct nfsd4_compoundargs *argp, static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { - void *new = NULL; if (p == argp->tmp) { - new = kmalloc(nbytes, GFP_KERNEL); - if (!new) return NULL; - p = new; + p = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; memcpy(p, argp->tmp, nbytes); } else { BUG_ON(p != argp->tmpp); argp->tmpp = NULL; } if (defer_free(argp, kfree, p)) { - kfree(new); + kfree(p); return NULL; } else return (char *)p; } - static __be32 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) { @@ -255,7 +253,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia return status; /* - * According to spec, unsupported attributes return ERR_NOTSUPP; + * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; * read-only attributes return ERR_INVAL. */ if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) @@ -273,42 +271,42 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - int nace, i; - struct nfs4_ace ace; + int nace; + struct nfs4_ace *ace; READ_BUF(4); len += 4; READ32(nace); - *acl = nfs4_acl_new(); + if (nace > NFS4_ACL_MAX) + return nfserr_resource; + + *acl = nfs4_acl_new(nace); if (*acl == NULL) { host_err = -ENOMEM; goto out_nfserr; } - defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); + defer_free(argp, kfree, *acl); - for (i = 0; i < nace; i++) { + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; - READ32(ace.type); - READ32(ace.flag); - READ32(ace.access_mask); + READ32(ace->type); + READ32(ace->flag); + READ32(ace->access_mask); READ32(dummy32); READ_BUF(dummy32); len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); - ace.whotype = nfs4_acl_get_whotype(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); host_err = 0; - if (ace.whotype != NFS4_ACL_WHO_NAMED) - ace.who = 0; - else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ace->who = 0; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) host_err = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace.who); + buf, dummy32, &ace->who); else host_err = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace.who); - if (host_err) - goto out_nfserr; - host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who); + buf, dummy32, &ace->who); if (host_err) goto out_nfserr; } @@ -1563,14 +1561,20 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (exp->ex_fslocs.migrated) { WRITE64(NFS4_REFERRAL_FSID_MAJOR); WRITE64(NFS4_REFERRAL_FSID_MINOR); - } else if (is_fsid(fhp, rqstp->rq_reffh)) { + } else switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: WRITE64((u64)exp->ex_fsid); WRITE64((u64)0); - } else { + break; + case FSIDSOURCE_DEV: WRITE32(0); WRITE32(MAJOR(stat.dev)); WRITE32(0); WRITE32(MINOR(stat.dev)); + break; + case FSIDSOURCE_UUID: + WRITEMEM(exp->ex_uuid, 16); + break; } } if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { @@ -1590,7 +1594,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; - struct list_head *h; if (acl == NULL) { if ((buflen -= 4) < 0) @@ -1603,9 +1606,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_resource; WRITE32(acl->naces); - list_for_each(h, &acl->ace_head) { - ace = list_entry(h, struct nfs4_ace, l_ace); - + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { if ((buflen -= 4*3) < 0) goto out_resource; WRITE32(ace->type); @@ -1815,7 +1816,7 @@ out_acl: status = nfs_ok; out: - nfs4_acl_free(acl); + kfree(acl); if (fhp == &tempfh) fh_put(&tempfh); return status; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index f90d7047585..578f2c9d56b 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -185,7 +185,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) rp->c_state = RC_INPROG; rp->c_xid = xid; rp->c_proc = proc; - rp->c_addr = rqstp->rq_addr; + memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr)); rp->c_prot = proto; rp->c_vers = vers; rp->c_timestamp = jiffies; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index eedf2e3990a..71c686dc725 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -123,7 +123,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu return PTR_ERR(data); rv = write_op[ino](file, data, size); - if (rv>0) { + if (rv >= 0) { simple_transaction_set(file, rv); rv = size; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c59d6fbb7a6..c2660cbfcd9 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -9,7 +9,6 @@ * ... and again Southern-Winter 2001 to support export_operations */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -20,6 +19,7 @@ #include <linux/mount.h> #include <asm/pgtable.h> +#include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -118,9 +118,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); - /* keep this filehandle for possible reference when encoding attributes */ - rqstp->rq_reffh = fh; - if (!fhp->fh_dentry) { __u32 *datap=NULL; __u32 tfh[3]; /* filehandle fragment for oldstyle filehandles */ @@ -145,10 +142,10 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) } len = key_len(fh->fh_fsid_type) / 4; if (len == 0) goto out; - if (fh->fh_fsid_type == 2) { + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { /* deprecated, convert to type 3 */ - len = 3; - fh->fh_fsid_type = 3; + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); fh->fh_fsid[1] = fh->fh_fsid[2]; } @@ -163,8 +160,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) /* assume old filehandle format */ xdev = old_decode_dev(fh->ofh_xdev); xino = u32_to_ino_t(fh->ofh_xino); - mk_fsid_v0(tfh, xdev, xino); - exp = exp_find(rqstp->rq_client, 0, tfh, &rqstp->rq_chandle); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = exp_find(rqstp->rq_client, FSID_DEV, tfh, + &rqstp->rq_chandle); } if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN @@ -180,10 +178,10 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) /* Check if the request originated from a secure port. */ error = nfserr_perm; if (!rqstp->rq_secure && EX_SECURE(exp)) { + char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING - "nfsd: request from insecure port (%u.%u.%u.%u:%d)!\n", - NIPQUAD(rqstp->rq_addr.sin_addr.s_addr), - ntohs(rqstp->rq_addr.sin_port)); + "nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); goto out; } @@ -211,7 +209,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) fileid_type = 2; } else fileid_type = fh->fh_fileid_type; - + if (fileid_type == 0) dentry = dget(exp->ex_dentry); else { @@ -291,7 +289,7 @@ static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, __u32 *datap, int *maxsize) { struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; - + if (dentry == exp->ex_dentry) { *maxsize = 0; return 0; @@ -316,7 +314,8 @@ static inline void _fh_update_old(struct dentry *dentry, } __be32 -fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh) +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) { /* ref_fh is a reference file handle. * if it is non-null and for the same filesystem, then we should compose @@ -326,12 +325,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st * */ - u8 ref_fh_version = 0; - u8 ref_fh_fsid_type = 0; + u8 version = 1; + u8 fsid_type = 0; struct inode * inode = dentry->d_inode; struct dentry *parent = dentry->d_parent; __u32 *datap; dev_t ex_dev = exp->ex_dentry->d_inode->i_sb->s_dev; + int root_export = (exp->ex_dentry == exp->ex_dentry->d_sb->s_root); dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", MAJOR(ex_dev), MINOR(ex_dev), @@ -339,57 +339,64 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st parent->d_name.name, dentry->d_name.name, (inode ? inode->i_ino : 0)); + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ if (ref_fh && ref_fh->fh_export == exp) { - ref_fh_version = ref_fh->fh_handle.fh_version; - if (ref_fh_version == 0xca) - ref_fh_fsid_type = 0; + version = ref_fh->fh_handle.fh_version; + if (version == 0xca) + fsid_type = FSID_DEV; else - ref_fh_fsid_type = ref_fh->fh_handle.fh_fsid_type; - if (ref_fh_fsid_type > 3) - ref_fh_fsid_type = 0; - - /* make sure ref_fh type works for given export */ - if (ref_fh_fsid_type == 1 && - !(exp->ex_flags & NFSEXP_FSID)) { - /* if we don't have an fsid, we cannot provide one... */ - ref_fh_fsid_type = 0; + fsid_type = ref_fh->fh_handle.fh_fsid_type; + /* We know this version/type works for this export + * so there is no need for further checks. + */ + } else if (exp->ex_uuid) { + if (fhp->fh_maxsize >= 64) { + if (root_export) + fsid_type = FSID_UUID16; + else + fsid_type = FSID_UUID16_INUM; + } else { + if (root_export) + fsid_type = FSID_UUID8; + else + fsid_type = FSID_UUID4_INUM; } } else if (exp->ex_flags & NFSEXP_FSID) - ref_fh_fsid_type = 1; - - if (!old_valid_dev(ex_dev) && ref_fh_fsid_type == 0) { + fsid_type = FSID_NUM; + else if (!old_valid_dev(ex_dev)) /* for newer device numbers, we must use a newer fsid format */ - ref_fh_version = 1; - ref_fh_fsid_type = 3; - } - if (old_valid_dev(ex_dev) && - (ref_fh_fsid_type == 2 || ref_fh_fsid_type == 3)) - /* must use type1 for smaller device numbers */ - ref_fh_fsid_type = 0; + fsid_type = FSID_ENCODE_DEV; + else + fsid_type = FSID_DEV; if (ref_fh == fhp) fh_put(ref_fh); if (fhp->fh_locked || fhp->fh_dentry) { printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n", - parent->d_name.name, dentry->d_name.name); + parent->d_name.name, dentry->d_name.name); } if (fhp->fh_maxsize < NFS_FHSIZE) printk(KERN_ERR "fh_compose: called with maxsize %d! %s/%s\n", - fhp->fh_maxsize, parent->d_name.name, dentry->d_name.name); + fhp->fh_maxsize, + parent->d_name.name, dentry->d_name.name); fhp->fh_dentry = dget(dentry); /* our internal copy */ fhp->fh_export = exp; cache_get(&exp->h); - if (ref_fh_version == 0xca) { + if (version == 0xca) { /* old style filehandle please */ memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; fhp->fh_handle.ofh_dcookie = 0xfeebbaca; fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; - fhp->fh_handle.ofh_xino = ino_t_to_u32(exp->ex_dentry->d_inode->i_ino); + fhp->fh_handle.ofh_xino = + ino_t_to_u32(exp->ex_dentry->d_inode->i_ino); fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); @@ -398,38 +405,12 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st fhp->fh_handle.fh_version = 1; fhp->fh_handle.fh_auth_type = 0; datap = fhp->fh_handle.fh_auth+0; - fhp->fh_handle.fh_fsid_type = ref_fh_fsid_type; - switch (ref_fh_fsid_type) { - case 0: - /* - * fsid_type 0: - * 2byte major, 2byte minor, 4byte inode - */ - mk_fsid_v0(datap, ex_dev, - exp->ex_dentry->d_inode->i_ino); - break; - case 1: - /* fsid_type 1 == 4 bytes filesystem id */ - mk_fsid_v1(datap, exp->ex_fsid); - break; - case 2: - /* - * fsid_type 2: - * 4byte major, 4byte minor, 4byte inode - */ - mk_fsid_v2(datap, ex_dev, - exp->ex_dentry->d_inode->i_ino); - break; - case 3: - /* - * fsid_type 3: - * 4byte devicenumber, 4byte inode - */ - mk_fsid_v3(datap, ex_dev, - exp->ex_dentry->d_inode->i_ino); - break; - } - len = key_len(ref_fh_fsid_type); + fhp->fh_handle.fh_fsid_type = fsid_type; + mk_fsid(fsid_type, datap, ex_dev, + exp->ex_dentry->d_inode->i_ino, + exp->ex_fsid, exp->ex_uuid); + + len = key_len(fsid_type); datap += len/4; fhp->fh_handle.fh_size = 4 + len; @@ -456,7 +437,7 @@ fh_update(struct svc_fh *fhp) { struct dentry *dentry; __u32 *datap; - + if (!fhp->fh_dentry) goto out_bad; @@ -533,3 +514,22 @@ char * SVCFH_fmt(struct svc_fh *fhp) fh->fh_base.fh_pad[5]); return buf; } + +enum fsid_source fsid_source(struct svc_fh *fhp) +{ + if (fhp->fh_handle.fh_version != 1) + return FSIDSOURCE_DEV; + switch(fhp->fh_handle.fh_fsid_type) { + case FSID_DEV: + case FSID_ENCODE_DEV: + case FSID_MAJOR_MINOR: + return FSIDSOURCE_DEV; + case FSID_NUM: + return FSIDSOURCE_FSID; + default: + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + else + return FSIDSOURCE_UUID; + } +} diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index ec983b77768..5cc2eec981b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -19,6 +19,7 @@ #include <linux/unistd.h> #include <linux/slab.h> +#include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/cache.h> @@ -147,10 +148,10 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, */ if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { + char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_NOTICE - "oversized read request from %u.%u.%u.%u:%d (%d bytes)\n", - NIPQUAD(rqstp->rq_addr.sin_addr.s_addr), - ntohs(rqstp->rq_addr.sin_port), + "oversized read request from %s (%d bytes)\n", + svc_print_addr(rqstp, buf, sizeof(buf)), argp->count); argp->count = NFSSVC_MAXBLKSIZE_V2; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fbf5d51947e..d7759ce6ed9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -235,7 +235,8 @@ static int nfsd_init_socks(int port) error = lockd_up(IPPROTO_UDP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); + error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, + SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); } @@ -245,7 +246,8 @@ static int nfsd_init_socks(int port) #ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); + error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, + SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 6555c50d900..0c24b9e24fe 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -153,6 +153,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct dentry *dentry = fhp->fh_dentry; int type; struct timespec time; + u32 f; type = (stat->mode & S_IFMT); @@ -173,10 +174,22 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, else *p++ = htonl(0xffffffff); *p++ = htonl((u32) stat->blocks); - if (is_fsid(fhp, rqstp->rq_reffh)) - *p++ = htonl((u32) fhp->fh_export->ex_fsid); - else + switch (fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: *p++ = htonl(new_encode_dev(stat->dev)); + break; + case FSIDSOURCE_FSID: + *p++ = htonl((u32) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u32*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; + *p++ = htonl(f); + break; + } *p++ = htonl((u32) stat->ino); *p++ = htonl((u32) stat->atime.tv_sec); *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8283236c6a0..7e6aa245b5d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -466,7 +466,10 @@ out: posix_acl_release(dpacl); return (error); out_nfserr: - error = nfserrno(host_error); + if (host_error == -EOPNOTSUPP) + error = nfserr_attrnotsupp; + else + error = nfserrno(host_error); goto out; } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index c577d8e1bd9..7659cc19299 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1921,7 +1921,7 @@ s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, u32 attr_len = 0; /* Silence stupid gcc warning. */ bool mp_rebuilt; -#ifdef NTFS_DEBUG +#ifdef DEBUG read_lock_irqsave(&ni->size_lock, flags); allocated_size = ni->allocated_size; read_unlock_irqrestore(&ni->size_lock, flags); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 076c9420c25..d69c4595ccd 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2328,7 +2328,7 @@ const struct file_operations ntfs_file_ops = { the data source. */ }; -struct inode_operations ntfs_file_inode_ops = { +const struct inode_operations ntfs_file_inode_ops = { #ifdef NTFS_RW .truncate = ntfs_truncate_vfs, .setattr = ntfs_setattr, @@ -2337,4 +2337,4 @@ struct inode_operations ntfs_file_inode_ops = { const struct file_operations ntfs_empty_file_ops = {}; -struct inode_operations ntfs_empty_inode_ops = {}; +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index eddb2247cec..bff01a54675 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -359,7 +359,7 @@ err_out: /** * Inode operations for directories. */ -struct inode_operations ntfs_dir_inode_ops = { +const struct inode_operations ntfs_dir_inode_ops = { .lookup = ntfs_lookup, /* VFS: Lookup directory. */ }; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index a12847ae467..d73f5a9ac34 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -61,13 +61,13 @@ extern const struct address_space_operations ntfs_aops; extern const struct address_space_operations ntfs_mst_aops; extern const struct file_operations ntfs_file_ops; -extern struct inode_operations ntfs_file_inode_ops; +extern const struct inode_operations ntfs_file_inode_ops; extern const struct file_operations ntfs_dir_ops; -extern struct inode_operations ntfs_dir_inode_ops; +extern const struct inode_operations ntfs_dir_inode_ops; extern const struct file_operations ntfs_empty_file_ops; -extern struct inode_operations ntfs_empty_inode_ops; +extern const struct inode_operations ntfs_empty_inode_ops; extern struct export_operations ntfs_export_ops; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index babf94d90de..1594c90b716 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2699,7 +2699,7 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) /** * The complete super operations. */ -static struct super_operations ntfs_sops = { +static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ #ifdef NTFS_RW diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 1c23138d00b..4847fbfb010 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -33,20 +33,28 @@ #include "sysctl.h" #include "debug.h" -#define FS_NTFS 1 - /* Definition of the ntfs sysctl. */ static ctl_table ntfs_sysctls[] = { - { FS_NTFS, "ntfs-debug", /* Binary and text IDs. */ - &debug_msgs,sizeof(debug_msgs), /* Data pointer and size. */ - 0644, NULL, &proc_dointvec }, /* Mode, child, proc handler. */ - { 0 } + { + .ctl_name = CTL_UNNUMBERED, /* Binary and text IDs. */ + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = &proc_dointvec + }, + {} }; /* Define the parent directory /proc/sys/fs. */ static ctl_table sysctls_root[] = { - { CTL_FS, "fs", NULL, 0, 0555, ntfs_sysctls }, - { 0 } + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = ntfs_sysctls + }, + {} }; /* Storage for the sysctls header. */ @@ -62,17 +70,9 @@ int ntfs_sysctl(int add) { if (add) { BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl_table(sysctls_root, 0); + sysctls_root_table = register_sysctl_table(sysctls_root); if (!sysctls_root_table) return -ENOMEM; -#ifdef CONFIG_PROC_FS - /* - * If the proc filesystem is in use and we are a module, need - * to set the owner of our proc entry to our module. In the - * non-modular case, THIS_MODULE is NULL, so this is ok. - */ - ntfs_sysctls[0].de->owner = THIS_MODULE; -#endif } else { BUG_ON(!sysctls_root_table); unregister_sysctl_table(sysctls_root_table); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 277ca67a2ad..5a9779bb923 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg) flush_scheduled_work(); } -static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, - unsigned int num_ios) +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) { - atomic_set(&wc->wc_num_reqs, num_ios); + atomic_set(&wc->wc_num_reqs, 1); init_completion(&wc->wc_io_complete); wc->wc_error = 0; } @@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; blk_run_address_space(mapping); + o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); } @@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio, return 1; o2hb_bio_wait_dec(wc, 1); + bio_put(bio); return 0; } @@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio, * start_slot. */ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, - unsigned int start_slot, - unsigned int num_slots) + unsigned int *current_slot, + unsigned int max_slots) { - int i, nr_vecs, len, first_page, last_page; + int len, current_page; unsigned int vec_len, vec_start; unsigned int bits = reg->hr_block_bits; unsigned int spp = reg->hr_slots_per_page; + unsigned int cs = *current_slot; struct bio *bio; struct page *page; - nr_vecs = (num_slots + spp - 1) / spp; - /* Testing has shown this allocation to take long enough under * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, nr_vecs); + bio = bio_alloc(GFP_ATOMIC, 16); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); + bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - first_page = start_slot / spp; - last_page = first_page + nr_vecs; - vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; - for(i = first_page; i < last_page; i++) { - page = reg->hr_slot_data[i]; + vec_start = (cs << bits) % PAGE_CACHE_SIZE; + while(cs < max_slots) { + current_page = cs / spp; + page = reg->hr_slot_data[current_page]; - vec_len = PAGE_CACHE_SIZE; - /* last page might be short */ - if (((i + 1) * spp) > (start_slot + num_slots)) - vec_len = ((num_slots + start_slot) % spp) << bits; - vec_len -= vec_start; + vec_len = min(PAGE_CACHE_SIZE, + (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", - i, vec_len, vec_start); + current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { - bio_put(bio); - bio = ERR_PTR(-EIO); - - mlog(ML_ERROR, "Error adding page to bio i = %d, " - "vec_len = %u, len = %d\n, start = %u\n", - i, vec_len, len, vec_start); - goto bail; - } + if (len != vec_len) break; + cs += vec_len / (PAGE_CACHE_SIZE/spp); vec_start = 0; } bail: + *current_slot = cs; return bio; } -/* - * Compute the maximum number of sectors the bdev can handle in one bio, - * as a power of two. - * - * Stolen from oracleasm, thanks Joel! - */ -static int compute_max_sectors(struct block_device *bdev) -{ - int max_pages, max_sectors, pow_two_sectors; - - struct request_queue *q; - - q = bdev_get_queue(bdev); - max_pages = q->max_sectors >> (PAGE_SHIFT - 9); - if (max_pages > BIO_MAX_PAGES) - max_pages = BIO_MAX_PAGES; - if (max_pages > q->max_phys_segments) - max_pages = q->max_phys_segments; - if (max_pages > q->max_hw_segments) - max_pages = q->max_hw_segments; - max_pages--; /* Handle I/Os that straddle a page */ - - if (max_pages) { - max_sectors = max_pages << (PAGE_SHIFT - 9); - } else { - /* If BIO contains 1 or less than 1 page. */ - max_sectors = q->max_sectors; - } - /* Why is fls() 1-based???? */ - pow_two_sectors = 1 << (fls(max_sectors) - 1); - - return pow_two_sectors; -} - -static inline void o2hb_compute_request_limits(struct o2hb_region *reg, - unsigned int num_slots, - unsigned int *num_bios, - unsigned int *slots_per_bio) -{ - unsigned int max_sectors, io_sectors; - - max_sectors = compute_max_sectors(reg->hr_bdev); - - io_sectors = num_slots << (reg->hr_block_bits - 9); - - *num_bios = (io_sectors + max_sectors - 1) / max_sectors; - *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); - - mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This " - "device can handle %u sectors of I/O\n", io_sectors, num_slots, - max_sectors); - mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", - *num_bios, *slots_per_bio); -} - static int o2hb_read_slots(struct o2hb_region *reg, unsigned int max_slots) { - unsigned int num_bios, slots_per_bio, start_slot, num_slots; - int i, status; + unsigned int current_slot=0; + int status; struct o2hb_bio_wait_ctxt wc; - struct bio **bios; struct bio *bio; - o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); + o2hb_bio_wait_init(&wc); - bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); - if (!bios) { - status = -ENOMEM; - mlog_errno(status); - return status; - } - - o2hb_bio_wait_init(&wc, num_bios); - - num_slots = slots_per_bio; - for(i = 0; i < num_bios; i++) { - start_slot = i * slots_per_bio; - - /* adjust num_slots at last bio */ - if (max_slots < (start_slot + num_slots)) - num_slots = max_slots - start_slot; - - bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); + while(current_slot < max_slots) { + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); if (IS_ERR(bio)) { - o2hb_bio_wait_dec(&wc, num_bios - i); - status = PTR_ERR(bio); mlog_errno(status); goto bail_and_wait; } - bios[i] = bio; + atomic_inc(&wc.wc_num_reqs); submit_bio(READ, bio); } @@ -403,38 +319,30 @@ bail_and_wait: if (wc.wc_error && !status) status = wc.wc_error; - if (bios) { - for(i = 0; i < num_bios; i++) - if (bios[i]) - bio_put(bios[i]); - kfree(bios); - } - return status; } static int o2hb_issue_node_write(struct o2hb_region *reg, - struct bio **write_bio, struct o2hb_bio_wait_ctxt *write_wc) { int status; unsigned int slot; struct bio *bio; - o2hb_bio_wait_init(write_wc, 1); + o2hb_bio_wait_init(write_wc); slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); goto bail; } + atomic_inc(&write_wc->wc_num_reqs); submit_bio(WRITE, bio); - *write_bio = bio; status = 0; bail: return status; @@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; - struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* And fire off the write. Note that we don't wait on this I/O * until later. */ - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + ret = o2hb_issue_node_write(reg, &write_wc); if (ret < 0) { mlog_errno(ret); return ret; @@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * people we find in our steady state have seen us. */ o2hb_wait_on_io(reg, &write_wc); - bio_put(write_bio); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -943,7 +849,6 @@ static int o2hb_thread(void *data) { int i, ret; struct o2hb_region *reg = data; - struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; struct timeval before_hb, after_hb; unsigned int elapsed_msec; @@ -993,10 +898,9 @@ static int o2hb_thread(void *data) * * XXX: Should we skip this on unclean_stop? */ o2hb_prepare_block(reg, 0); - ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) { o2hb_wait_on_io(reg, &write_wc); - bio_put(write_bio); } else { mlog_errno(ret); } diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index b17333a0606..9f5ad0f01ce 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -55,7 +55,7 @@ static ctl_table ocfs2_nm_table[] = { static ctl_table ocfs2_mod_table[] = { { - .ctl_name = KERN_OCFS2_NM, + .ctl_name = FS_OCFS2_NM, .procname = "nm", .data = NULL, .maxlen = 0, @@ -67,7 +67,7 @@ static ctl_table ocfs2_mod_table[] = { static ctl_table ocfs2_kern_table[] = { { - .ctl_name = KERN_OCFS2, + .ctl_name = FS_OCFS2, .procname = "ocfs2", .data = NULL, .maxlen = 0, @@ -922,7 +922,7 @@ static int __init init_o2nm(void) o2hb_init(); o2net_init(); - ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0); + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); if (!ocfs2_table_header) { printk(KERN_ERR "nodemanager: unable to register sysctl\n"); ret = -ENOMEM; /* or something. */ diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 8fb23cacc2f..070522138ae 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,8 +33,7 @@ #include <linux/configfs.h> #include <linux/rbtree.h> -#define KERN_OCFS2 988 -#define KERN_OCFS2_NM 1 +#define FS_OCFS2_NM 1 const char *o2nm_get_hb_ctl_path(void); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ae4ff4a6636..1718215fc01 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -556,6 +556,8 @@ static void o2net_register_callbacks(struct sock *sk, sk->sk_data_ready = o2net_data_ready; sk->sk_state_change = o2net_state_change; + mutex_init(&sc->sc_send_lock); + write_unlock_bh(&sk->sk_callback_lock); } @@ -688,6 +690,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh) * be given to the handler if their payload is longer than the max. */ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, struct list_head *unreg_list) { struct o2net_msg_handler *nmh = NULL; @@ -722,6 +725,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, nmh->nh_func = func; nmh->nh_func_data = data; + nmh->nh_post_func = post_func; nmh->nh_msg_type = msg_type; nmh->nh_max_len = max_len; nmh->nh_key = key; @@ -856,10 +860,12 @@ static void o2net_sendpage(struct o2net_sock_container *sc, ssize_t ret; + mutex_lock(&sc->sc_send_lock); ret = sc->sc_sock->ops->sendpage(sc->sc_sock, virt_to_page(kmalloced_virt), (long)kmalloced_virt & ~PAGE_MASK, size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); if (ret != size) { mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); @@ -974,8 +980,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, /* finally, convert the message header to network byte-order * and send */ + mutex_lock(&sc->sc_send_lock); ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, sizeof(struct o2net_msg) + caller_bytes); + mutex_unlock(&sc->sc_send_lock); msglog(msg, "sending returned %d\n", ret); if (ret < 0) { mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); @@ -1049,6 +1057,7 @@ static int o2net_process_message(struct o2net_sock_container *sc, int ret = 0, handler_status; enum o2net_system_error syserr; struct o2net_msg_handler *nmh = NULL; + void *ret_data = NULL; msglog(hdr, "processing message\n"); @@ -1101,17 +1110,26 @@ static int o2net_process_message(struct o2net_sock_container *sc, sc->sc_msg_type = be16_to_cpu(hdr->msg_type); handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len), - nmh->nh_func_data); + nmh->nh_func_data, &ret_data); do_gettimeofday(&sc->sc_tv_func_stop); out_respond: /* this destroys the hdr, so don't use it after this */ + mutex_lock(&sc->sc_send_lock); ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, handler_status); + mutex_unlock(&sc->sc_send_lock); hdr = NULL; mlog(0, "sending handler status %d, syserr %d returned %d\n", handler_status, syserr, ret); + if (nmh) { + BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); + if (nmh->nh_post_func) + (nmh->nh_post_func)(handler_status, nmh->nh_func_data, + ret_data); + } + out: if (nmh) o2net_handler_put(nmh); @@ -1795,13 +1813,13 @@ out: ready(sk, bytes); } -static int o2net_open_listening_sock(__be16 port) +static int o2net_open_listening_sock(__be32 addr, __be16 port) { struct socket *sock = NULL; int ret; struct sockaddr_in sin = { .sin_family = PF_INET, - .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, + .sin_addr = { .s_addr = (__force u32)addr }, .sin_port = (__force u16)port, }; @@ -1824,15 +1842,15 @@ static int o2net_open_listening_sock(__be16 port) sock->sk->sk_reuse = 1; ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (ret < 0) { - mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", - ntohs(port), ret); + mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, " + "ret=%d\n", NIPQUAD(addr), ntohs(port), ret); goto out; } ret = sock->ops->listen(sock, 64); if (ret < 0) { - mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", - ntohs(port), ret); + mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n", + NIPQUAD(addr), ntohs(port), ret); } out: @@ -1865,7 +1883,8 @@ int o2net_start_listening(struct o2nm_node *node) return -ENOMEM; /* ? */ } - ret = o2net_open_listening_sock(node->nd_ipv4_port); + ret = o2net_open_listening_sock(node->nd_ipv4_address, + node->nd_ipv4_port); if (ret) { destroy_workqueue(o2net_wq); o2net_wq = NULL; diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 21a4e43df83..da880fc215f 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -50,7 +50,10 @@ struct o2net_msg __u8 buf[0]; }; -typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +typedef void (o2net_post_msg_handler_func)(int status, void *data, + void *ret_data); #define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) @@ -99,6 +102,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, + o2net_post_msg_handler_func *post_func, struct list_head *unreg_list); void o2net_unregister_handler_list(struct list_head *list); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b700dc9624d..4dae5df5e46 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -38,6 +38,12 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 7: + * - DLM join domain includes the live nodemap + * + * New in version 6: + * - DLM lockres remote refcount fixes. + * * New in version 5: * - Network timeout checking protocol * @@ -51,7 +57,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 5ULL +#define O2NET_PROTOCOL_VERSION 7ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; @@ -149,6 +155,8 @@ struct o2net_sock_container { struct timeval sc_tv_func_stop; u32 sc_msg_key; u16 sc_msg_type; + + struct mutex sc_send_lock; }; struct o2net_msg_handler { @@ -158,6 +166,8 @@ struct o2net_msg_handler { u32 nh_key; o2net_msg_handler_func *nh_func; o2net_msg_handler_func *nh_func_data; + o2net_post_msg_handler_func + *nh_post_func; struct kref nh_kref; struct list_head nh_unregister_item; }; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 681046d5139..241cad342a4 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -263,7 +263,8 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, -int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { int ret; unsigned int locklen; @@ -311,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" "name=%.*s\n", past->type, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name); ret = DLM_IVLOCKID; goto leave; @@ -323,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got %sast for unknown lockres! " "cookie=%u:%llu, name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, locklen); ret = DLM_IVLOCKID; goto leave; @@ -369,7 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, locklen); ret = DLM_NORMAL; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 6b6ff76538c..e90b92f9ece 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -180,6 +180,11 @@ struct dlm_assert_master_priv unsigned ignore_higher:1; }; +struct dlm_deref_lockres_priv +{ + struct dlm_lock_resource *deref_res; + u8 deref_node; +}; struct dlm_work_item { @@ -191,6 +196,7 @@ struct dlm_work_item struct dlm_request_all_locks_priv ral; struct dlm_mig_lockres_priv ml; struct dlm_assert_master_priv am; + struct dlm_deref_lockres_priv dl; } u; }; @@ -222,6 +228,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DIRTY 0x00000008 #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 +#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 +#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -265,6 +274,8 @@ struct dlm_lock_resource u8 owner; //node which owns the lock resource, or unknown u16 state; char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; struct dlm_migratable_lock @@ -367,7 +378,7 @@ enum { DLM_CONVERT_LOCK_MSG, /* 504 */ DLM_PROXY_AST_MSG, /* 505 */ DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_UNUSED_MSG2, /* 507 */ + DLM_DEREF_LOCKRES_MSG, /* 507 */ DLM_MIGRATE_REQUEST_MSG, /* 508 */ DLM_MIG_LOCKRES_MSG, /* 509 */ DLM_QUERY_JOIN_MSG, /* 510 */ @@ -417,6 +428,9 @@ struct dlm_master_request u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 +#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 + #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 #define DLM_ASSERT_MASTER_REQUERY 0x00000002 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 @@ -430,6 +444,8 @@ struct dlm_assert_master u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 + struct dlm_migrate_request { u8 master; @@ -609,12 +625,16 @@ struct dlm_begin_reco }; +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) + struct dlm_query_join_request { u8 node_idx; u8 pad1[2]; u8 name_len; u8 domain[O2NM_MAX_NAME_LEN]; + u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)]; }; struct dlm_assert_joined @@ -648,6 +668,16 @@ struct dlm_finalize_reco __be32 pad2; }; +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -688,16 +718,20 @@ void dlm_lock_put(struct dlm_lock *lock); void dlm_lock_attach_lockres(struct dlm_lock *lock, struct dlm_lock_resource *res); -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock); void dlm_revert_pending_lock(struct dlm_lock_resource *res, struct dlm_lock *lock); -int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock); void dlm_commit_pending_unlock(struct dlm_lock_resource *res, @@ -721,8 +755,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -void dlm_purge_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); static inline void dlm_lockres_get(struct dlm_lock_resource *res) { /* This is called on every lookup, so it might be worth @@ -733,6 +765,10 @@ void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len, @@ -753,6 +789,47 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int namelen); +#define dlm_lockres_set_refmap_bit(bit,res) \ + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) +#define dlm_lockres_clear_refmap_bit(bit,res) \ + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) + +static inline void __dlm_lockres_set_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: setting bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + set_bit(bit, res->refmap); +} + +static inline void __dlm_lockres_clear_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + clear_bit(bit, res->refmap); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line); +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line); +#define dlm_lockres_drop_inflight_ref(d,r) \ + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref_new(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, @@ -801,10 +878,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); -int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_migrate_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - u8 target); +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 old_master); @@ -812,15 +886,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); -int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); -int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +void dlm_assert_master_post_handler(int status, void *data, void *ret_data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); @@ -856,10 +942,12 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) int dlm_init_mle_cache(void); void dlm_destroy_mle_cache(void); void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); - +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index c764dc8e40a..ecb4d997221 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -286,8 +286,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, __dlm_print_one_lock_resource(res); mlog(ML_ERROR, "converting a remote lock that is already " "converting! (cookie=%u:%llu, conv=%d)\n", - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), lock->ml.convert_type); status = DLM_DENIED; goto bail; @@ -418,7 +418,8 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, * status from __dlmconvert_master */ -int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; @@ -428,7 +429,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0, ast_reserved = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -479,25 +480,14 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } - if (!lock) { - __dlm_print_one_lock_resource(res); - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.node == cnv->node_idx) { - mlog(ML_ERROR, "There is something here " - "for node %u, lock->ml.cookie=%llu, " - "cnv->cookie=%llu\n", cnv->node_idx, - (unsigned long long)lock->ml.cookie, - (unsigned long long)cnv->cookie); - break; - } - } - lock = NULL; - } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; - dlm_error(status); + mlog(ML_ERROR, "did not find lock to convert on grant queue! " + "cookie=%u:%llu\n", + dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie))); + __dlm_print_one_lock_resource(res); goto leave; } @@ -524,8 +514,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) cnv->requested_type, &call_ast, &kick_thread); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + wake = 1; } spin_unlock(&res->spinlock); + if (wake) + wake_up(&res->wq); if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) @@ -534,12 +527,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } leave: - if (!lock) - mlog(ML_ERROR, "did not find lock to convert on grant queue! " - "cookie=%u:%llu\n", - dlm_get_lock_cookie_node(cnv->cookie), - dlm_get_lock_cookie_seq(cnv->cookie)); - else + if (lock) dlm_lock_put(lock); /* either queue the ast or release it, if reserved */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 3f6c8d88f7a..64239b37e5d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct dlm_lock_resource *res) spin_unlock(&res->spinlock); } +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, " refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { struct list_head *iter2; @@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->owner, res->state); mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", res->last_used, list_empty(&res->purge) ? "no" : "yes"); + dlm_print_lockres_refmap(res); mlog(ML_NOTICE, " granted queue: \n"); list_for_each(iter2, &res->granted) { lock = list_entry(iter2, struct dlm_lock, list); @@ -72,8 +90,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -87,8 +105,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', @@ -102,8 +120,8 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", lock->ml.type, lock->ml.convert_type, lock->ml.node, - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), list_empty(&lock->ast_list) ? 'y' : 'n', lock->ast_pending ? 'y' : 'n', list_empty(&lock->bast_list) ? 'y' : 'n', diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index f0b25f2dd20..6087c4749fe 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -48,6 +48,36 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +/* + * ocfs2 node maps are array of long int, which limits to send them freely + * across the wire due to endianness issues. To workaround this, we convert + * long ints to byte arrays. Following 3 routines are helper functions to + * set/test/copy bits within those array of bytes + */ +static inline void byte_set_bit(u8 nr, u8 map[]) +{ + map[nr >> 3] |= (1UL << (nr & 7)); +} + +static inline int byte_test_bit(u8 nr, u8 map[]) +{ + return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0; +} + +static inline void byte_copymap(u8 dmap[], unsigned long smap[], + unsigned int sz) +{ + unsigned int nn; + + if (!sz) + return; + + memset(dmap, 0, ((sz + 7) >> 3)); + for (nn = 0 ; nn < sz; nn++) + if (test_bit(nn, smap)) + byte_set_bit(nn, dmap); +} + static void dlm_free_pagevec(void **vec, int pages) { while (pages--) @@ -95,10 +125,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); #define DLM_DOMAIN_BACKOFF_MS 200 -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); @@ -125,10 +159,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, hlist_add_head(&res->hash_node, bucket); } -struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len, - unsigned int hash) +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) { struct hlist_head *bucket; struct hlist_node *list; @@ -154,6 +188,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, return NULL; } +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog_entry("%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len) @@ -330,43 +395,60 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) wake_up(&dlm_domain_events); } -static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) { - int i; + int i, num, n, ret = 0; struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; mlog(0, "Migrating locks from domain %s\n", dlm->name); -restart: + + num = 0; spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(dlm_lockres_hash(dlm, i))) { - res = hlist_entry(dlm_lockres_hash(dlm, i)->first, - struct dlm_lock_resource, hash_node); - /* need reference when manually grabbing lockres */ +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); dlm_lockres_get(res); - /* this should unhash the lockres - * and exit with dlm->spinlock */ - mlog(0, "purging res=%p\n", res); - if (dlm_lockres_is_dirty(dlm, res)) { - /* HACK! this should absolutely go. - * need to figure out why some empty - * lockreses are still marked dirty */ - mlog(ML_ERROR, "lockres %.*s dirty!\n", - res->lockname.len, res->lockname.name); - - spin_unlock(&dlm->spinlock); - dlm_kick_thread(dlm, res); - wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); - dlm_lockres_put(res); - goto restart; - } - dlm_purge_lockres(dlm, res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. */ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + cond_resched_lock(&dlm->spinlock); + + if (dropped) + goto redo_bucket; } + num += n; + mlog(0, "%s: touched %d lockreses in bucket %d " + "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); - + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; } static int dlm_no_joining_node(struct dlm_ctxt *dlm) @@ -418,7 +500,8 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm) printk("\n"); } -static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; unsigned int node; @@ -571,7 +654,9 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); - dlm_migrate_all_locks(dlm); + while (dlm_migrate_all_locks(dlm)) { + mlog(0, "%s: more migration to do\n", dlm->name); + } dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); dlm_complete_dlm_shutdown(dlm); @@ -580,11 +665,13 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) } EXPORT_SYMBOL_GPL(dlm_unregister_domain); -static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_query_join_request *query; enum dlm_query_join_response response; struct dlm_ctxt *dlm = NULL; + u8 nodenum; query = (struct dlm_query_join_request *) msg->buf; @@ -608,6 +695,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + if (!dlm) + goto unlock_respond; + + /* + * There is a small window where the joining node may not see the + * node(s) that just left but still part of the cluster. DISALLOW + * join request if joining node has different node map. + */ + nodenum=0; + while (nodenum < O2NM_MAX_NODES) { + if (test_bit(nodenum, dlm->domain_map)) { + if (!byte_test_bit(nodenum, query->node_map)) { + mlog(0, "disallow join as node %u does not " + "have node %u in its nodemap\n", + query->node_idx, nodenum); + response = JOIN_DISALLOW; + goto unlock_respond; + } + } + nodenum++; + } + /* Once the dlm ctxt is marked as leaving then we don't want * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome @@ -626,15 +735,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) /* Disallow parallel joins. */ response = JOIN_DISALLOW; } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { - mlog(ML_NOTICE, "node %u trying to join, but recovery " + mlog(0, "node %u trying to join, but recovery " "is ongoing.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->recovery_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "still needs recovery.\n", bit); response = JOIN_DISALLOW; } else if (test_bit(bit, dlm->domain_map)) { - mlog(ML_NOTICE, "node %u trying to join, but it " + mlog(0, "node %u trying to join, but it " "is still in the domain! needs recovery?\n", bit); response = JOIN_DISALLOW; @@ -649,6 +758,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); } +unlock_respond: spin_unlock(&dlm_domain_lock); respond: @@ -657,7 +767,8 @@ respond: return response; } -static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_assert_joined *assert; struct dlm_ctxt *dlm = NULL; @@ -694,7 +805,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) return 0; } -static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_cancel_join *cancel; struct dlm_ctxt *dlm = NULL; @@ -796,6 +908,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm, join_msg.name_len = strlen(dlm->name); memcpy(join_msg.domain, dlm->name, join_msg.name_len); + /* copy live node map to join message */ + byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, sizeof(join_msg), node, &retval); if (status < 0 && status != -ENOPROTOOPT) { @@ -1036,98 +1151,106 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, sizeof(struct dlm_master_request), dlm_master_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, sizeof(struct dlm_assert_master), dlm_assert_master_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, dlm_assert_master_post_handler, + &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, sizeof(struct dlm_create_lock), dlm_create_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, DLM_CONVERT_LOCK_MAX_LEN, dlm_convert_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, DLM_UNLOCK_LOCK_MAX_LEN, dlm_unlock_lock_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, DLM_PROXY_AST_MAX_LEN, dlm_proxy_ast_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, sizeof(struct dlm_exit_domain), dlm_exit_domain_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, sizeof(struct dlm_migrate_request), dlm_migrate_request_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, DLM_MIG_LOCKRES_MAX_LEN, dlm_mig_lockres_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, sizeof(struct dlm_master_requery), dlm_master_requery_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, sizeof(struct dlm_lock_request), dlm_request_all_locks_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, sizeof(struct dlm_reco_data_done), dlm_reco_data_done_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, sizeof(struct dlm_begin_reco), dlm_begin_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, sizeof(struct dlm_finalize_reco), dlm_finalize_reco_handler, - dlm, &dlm->dlm_domain_handlers); + dlm, NULL, &dlm->dlm_domain_handlers); if (status) goto bail; @@ -1141,6 +1264,8 @@ bail: static int dlm_join_domain(struct dlm_ctxt *dlm) { int status; + unsigned int backoff; + unsigned int total_backoff = 0; BUG_ON(!dlm); @@ -1172,18 +1297,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } do { - unsigned int backoff; status = dlm_try_to_join_domain(dlm); /* If we're racing another node to the join, then we * need to back off temporarily and let them * complete. */ +#define DLM_JOIN_TIMEOUT_MSECS 90000 if (status == -EAGAIN) { if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } + if (total_backoff > + msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + status = -ERESTARTSYS; + mlog(ML_NOTICE, "Timed out joining dlm domain " + "%s after %u msecs\n", dlm->name, + jiffies_to_msecs(total_backoff)); + goto bail; + } + /* * <chip> After you! * <dale> No, after you! @@ -1193,6 +1327,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) */ backoff = (unsigned int)(jiffies & 0x3); backoff *= DLM_DOMAIN_BACKOFF_MS; + total_backoff += backoff; mlog(0, "backoff %d\n", backoff); msleep(backoff); } @@ -1421,21 +1556,21 @@ static int dlm_register_net_handlers(void) status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_query_join_request), dlm_query_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, sizeof(struct dlm_assert_joined), dlm_assert_joined_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); if (status) goto bail; status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, sizeof(struct dlm_cancel_join), dlm_cancel_join_handler, - NULL, &dlm_join_handlers); + NULL, NULL, &dlm_join_handlers); bail: if (status < 0) diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index b7f0ba97a1a..de952eba29a 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -61,11 +61,11 @@ #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" -static struct super_operations dlmfs_ops; -static struct file_operations dlmfs_file_operations; -static struct inode_operations dlmfs_dir_inode_operations; -static struct inode_operations dlmfs_root_inode_operations; -static struct inode_operations dlmfs_file_inode_operations; +static const struct super_operations dlmfs_ops; +static const struct file_operations dlmfs_file_operations; +static const struct inode_operations dlmfs_dir_inode_operations; +static const struct inode_operations dlmfs_root_inode_operations; +static const struct inode_operations dlmfs_file_inode_operations; static struct kmem_cache *dlmfs_inode_cache; struct workqueue_struct *user_dlm_worker; @@ -540,27 +540,27 @@ static int dlmfs_fill_super(struct super_block * sb, return 0; } -static struct file_operations dlmfs_file_operations = { +static const struct file_operations dlmfs_file_operations = { .open = dlmfs_file_open, .release = dlmfs_file_release, .read = dlmfs_file_read, .write = dlmfs_file_write, }; -static struct inode_operations dlmfs_dir_inode_operations = { +static const struct inode_operations dlmfs_dir_inode_operations = { .create = dlmfs_create, .lookup = simple_lookup, .unlink = dlmfs_unlink, }; /* this way we can restrict mkdir to only the toplevel of the fs. */ -static struct inode_operations dlmfs_root_inode_operations = { +static const struct inode_operations dlmfs_root_inode_operations = { .lookup = simple_lookup, .mkdir = dlmfs_mkdir, .rmdir = simple_rmdir, }; -static struct super_operations dlmfs_ops = { +static const struct super_operations dlmfs_ops = { .statfs = simple_statfs, .alloc_inode = dlmfs_alloc_inode, .destroy_inode = dlmfs_destroy_inode, @@ -568,7 +568,7 @@ static struct super_operations dlmfs_ops = { .drop_inode = generic_delete_inode, }; -static struct inode_operations dlmfs_file_inode_operations = { +static const struct inode_operations dlmfs_file_inode_operations = { .getattr = simple_getattr, }; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index e5ca3db197f..52578d907d9 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, kick_thread = 1; } } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -437,7 +441,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, * held on exit: none * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED */ -int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 0ad872055cb..77e4e6169a0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, int idx); static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); static inline int dlm_mle_equal(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry **mle, char *name, unsigned int namelen); -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, @@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, INIT_LIST_HEAD(&res->purge); atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; + res->inflight_locks = 0; kref_init(&res->refs); @@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; } +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { + int dropping_ref = 0; + + spin_lock(&tmpres->spinlock); + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); spin_unlock(&dlm->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. */ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + mlog(0, "found in hash!\n"); if (res) dlm_lockres_put(res); @@ -793,6 +855,7 @@ lookup: spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, dlm->node_num); __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* lockres still marked IN_PROGRESS */ @@ -805,29 +868,40 @@ lookup: /* if we found a block, wait for lock to be mastered by another node */ blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); if (blocked) { + int mig; if (mle->type == DLM_MLE_MASTER) { mlog(ML_ERROR, "master entry for nonexistent lock!\n"); BUG(); - } else if (mle->type == DLM_MLE_MIGRATION) { - /* migration is in progress! */ - /* the good news is that we now know the - * "current" master (mle->master). */ - + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); spin_unlock(&dlm->master_lock); - assert_spin_locked(&dlm->spinlock); - - /* set the lockres owner and hash it */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, mle->master); - __dlm_insert_lockres(dlm, res); - spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* master is known, detach */ - dlm_mle_detach_hb_events(dlm, mle); + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); mle = NULL; - goto wake_waiters; + /* this is lame, but we cant wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; } } else { /* go ahead and try to master lock on this node */ @@ -858,6 +932,13 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesnt not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we @@ -910,7 +991,7 @@ redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { - ret = dlm_do_master_request(mle, nodenum); + ret = dlm_do_master_request(res, mle, nodenum); if (ret < 0) mlog_errno(ret); if (mle->master != O2NM_MAX_NODES) { @@ -960,6 +1041,8 @@ wait: wake_waiters: spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -998,7 +1081,7 @@ recheck: /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ if (res->owner != dlm->node_num) { - ret = dlm_do_master_request(mle, res->owner); + ret = dlm_do_master_request(res, mle, res->owner); if (ret < 0) { /* give recovery a chance to run */ mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); @@ -1062,6 +1145,8 @@ recheck: * now tell other nodes that I am * mastering this. */ mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ assert = 1; sleep = 0; } @@ -1087,7 +1172,8 @@ recheck: (atomic_read(&mle->woken) == 1), timeo); if (res->owner == O2NM_MAX_NODES) { - mlog(0, "waiting again\n"); + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); goto recheck; } mlog(0, "done waiting, master is %u\n", res->owner); @@ -1100,8 +1186,7 @@ recheck: m = dlm->node_num; mlog(0, "about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, m); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, mle->vote_map, 0); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); if (ret) { /* This is a failure in the network path, * not in the response to the assert_master @@ -1117,6 +1202,8 @@ recheck: /* set the lockres owner */ spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ dlm_change_lockres_owner(dlm, res, m); spin_unlock(&res->spinlock); @@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, * */ -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) { struct dlm_ctxt *dlm = mle->dlm; struct dlm_master_request request; @@ -1339,6 +1427,9 @@ again: case DLM_MASTER_RESP_YES: set_bit(to, mle->response_map); mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); mle->master = to; break; case DLM_MASTER_RESP_NO: @@ -1379,7 +1470,8 @@ out: * * if possible, TRIM THIS DOWN!!! */ -int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { u8 response = DLM_MASTER_RESP_MAYBE; struct dlm_ctxt *dlm = data; @@ -1417,10 +1509,11 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " - "being recovered\n"); + "being recovered/migrated\n"); response = DLM_MASTER_RESP_ERROR; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1428,8 +1521,10 @@ way_up_top: } if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); spin_unlock(&res->spinlock); - // mlog(0, "this node is the master\n"); response = DLM_MASTER_RESP_YES; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1477,7 +1572,6 @@ way_up_top: mlog(0, "node %u is master, but trying to migrate to " "node %u.\n", tmpmle->master, tmpmle->new_master); if (tmpmle->master == dlm->node_num) { - response = DLM_MASTER_RESP_YES; mlog(ML_ERROR, "no owner on lockres, but this " "node is trying to migrate it to %u?!\n", tmpmle->new_master); @@ -1494,6 +1588,10 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1607,17 +1705,24 @@ send_response: * can periodically run all locks owned by this node * and re-assert across the cluster... */ -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags) +int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) { struct dlm_assert_master assert; int to, tmpret; struct dlm_node_iter iter; int ret = 0; int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + again: reassert = 0; @@ -1647,6 +1752,7 @@ again: mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; + r = 0; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " @@ -1661,17 +1767,39 @@ again: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); BUG(); - } else if (r == EAGAIN) { + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { mlog(0, "%.*s: node %u create mles on other " "nodes and requests a re-assert\n", namelen, lockname, to); reassert = 1; } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } } if (reassert) goto again; + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + return ret; } @@ -1684,7 +1812,8 @@ again: * * if possible, TRIM THIS DOWN!!! */ -int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_list_entry *mle = NULL; @@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) char *name; unsigned int namelen, hash; u32 flags; - int master_request = 0; + int master_request = 0, have_lockres_ref = 0; int ret = 0; if (!dlm_grab(dlm)) @@ -1851,6 +1980,7 @@ ok: spin_unlock(&mle->spinlock); if (res) { + int wake = 0; spin_lock(&res->spinlock); if (mle->type == DLM_MLE_MIGRATION) { mlog(0, "finishing off migration of lockres %.*s, " @@ -1858,12 +1988,16 @@ ok: res->lockname.len, res->lockname.name, dlm->node_num, mle->new_master); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; dlm_change_lockres_owner(dlm, res, mle->new_master); BUG_ON(res->state & DLM_LOCK_RES_DIRTY); } else { dlm_change_lockres_owner(dlm, res, mle->master); } spin_unlock(&res->spinlock); + have_lockres_ref = 1; + if (wake) + wake_up(&res->wq); } /* master is known, detach if not already detached. @@ -1913,12 +2047,28 @@ ok: done: ret = 0; - if (res) - dlm_lockres_put(res); + if (res) { + spin_lock(&res->spinlock); + res->state |= DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + *ret_data = (void *)res; + } dlm_put(dlm); if (master_request) { mlog(0, "need to tell master to reassert\n"); - ret = EAGAIN; // positive. negative would shoot down the node. + /* positive. negative would shoot down the node. */ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); } return ret; @@ -1929,11 +2079,25 @@ kill: __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); - dlm_lockres_put(res); + *ret_data = (void *)res; dlm_put(dlm); return -EINVAL; } +void dlm_assert_master_post_handler(int status, void *data, void *ret_data) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; + + if (ret_data) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_SETREF_INPROG; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + } + return; +} + int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, int ignore_higher, u8 request_from, u32 flags) @@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, dlm->node_num); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, - nodemap, flags); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ if (!dlm_is_host_down(ret)) @@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, return ret; } +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog_errno(ret); + else if (r < 0) { + /* BAD. other node says I did not have a ref. */ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + struct dlm_work_item *item; + int cleared = 0; + int dispatch = 0; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_SETREF_INPROG) + dispatch = 1; + else { + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + } + spin_unlock(&res->spinlock); + + if (!dispatch) { + if (cleared) + dlm_lockres_calc_usage(dlm, res); + else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + ret = 0; + goto done; + } + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (!item) { + ret = -ENOMEM; + mlog_errno(ret); + goto done; + } + + dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); + item->u.dl.deref_res = res; + item->u.dl.deref_node = node; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + queue_work(dlm->dlm_worker, &dlm->dispatched_work); + return 0; + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + + return ret; +} + +static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_lock_resource *res; + u8 node; + u8 cleared = 0; + + dlm = item->dlm; + res = item->u.dl.deref_res; + node = item->u.dl.deref_node; + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + dlm_lockres_clear_refmap_bit(node, res); + cleared = 1; + } + spin_unlock(&res->spinlock); + + if (cleared) { + mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", + dlm->name, res->lockname.len, res->lockname.name, node); + dlm_lockres_calc_usage(dlm, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + __dlm_print_one_lock_resource(res); + } + + dlm_lockres_put(res); +} + /* * DLM_MIGRATE_LOCKRES */ -int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - u8 target) +static int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) { struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; @@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct list_head *queue, *iter; int i; struct dlm_lock *lock; - int empty = 1; + int empty = 1, wake = 0; if (!dlm_grab(dlm)) return -EINVAL; @@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.name, target); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); ret = -EINVAL; } @@ -2268,6 +2597,9 @@ fail: * the lockres */ + /* now that remote nodes are spinning on the MIGRATING flag, + * ensure that all assert_master work is flushed. */ + flush_workqueue(dlm->dlm_worker); /* get an extra reference on the mle. * otherwise the assert_master from the new @@ -2296,6 +2628,7 @@ fail: dlm_put_mle_inuse(mle); spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; spin_unlock(&res->spinlock); goto leave; } @@ -2322,7 +2655,8 @@ fail: res->owner == target) break; - mlog(0, "timed out during migration\n"); + mlog(0, "%s:%.*s: timed out during migration\n", + dlm->name, res->lockname.len, res->lockname.name); /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { @@ -2330,20 +2664,20 @@ fail: "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); - ret = -ERESTARTSYS; + ret = -EINVAL; + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + wake = 1; + spin_unlock(&res->spinlock); + goto leave; } - } - if (ret == -ERESTARTSYS) { - /* migration failed, detach and clean up mle */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - dlm_put_mle_inuse(mle); - spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - spin_unlock(&res->spinlock); - goto leave; - } - /* TODO: if node died: stop, clean up, return error */ + } else + mlog(0, "%s:%.*s: caught signal during migration\n", + dlm->name, res->lockname.len, res->lockname.name); } /* all done, set the owner, clear the flag */ @@ -2366,6 +2700,11 @@ leave: if (ret < 0) dlm_kick_thread(dlm, res); + /* wake up waiters if the MIGRATING flag got set + * but migration failed */ + if (wake) + wake_up(&res->wq); + /* TODO: cleanup */ if (mres) free_page((unsigned long)mres); @@ -2376,6 +2715,53 @@ leave: return ret; } +#define DLM_MIGRATION_RETRY_MS 100 + +/* Should be called only after beginning the domain leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + goto leave; + } + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { int ret; @@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, return can_proceed; } -int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { int ret; spin_lock(&res->spinlock); @@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, __dlm_lockres_reserve_ast(res); spin_unlock(&res->spinlock); - /* now flush all the pending asts.. hang out for a bit */ + /* now flush all the pending asts */ dlm_kick_thread(dlm, res); + /* before waiting on DIRTY, block processes which may + * try to dirty the lockres before MIGRATING is set */ + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); + res->state |= DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); + /* now wait on any pending asts and the DIRTY state */ wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); dlm_lockres_release_ast(dlm, res); @@ -2461,6 +2855,13 @@ again: mlog(0, "trying again...\n"); goto again; } + /* now that we are sure the MIGRATING state is there, drop + * the unneded state which blocked threads trying to DIRTY */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + spin_unlock(&res->spinlock); /* did the target go down or die? */ spin_lock(&dlm->spinlock); @@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, { struct list_head *iter, *iter2; struct list_head *queue = &res->granted; - int i; + int i, bit; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); list_del_init(&lock->list); dlm_lock_put(lock); } } queue++; } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } } /* for now this is not too intelligent. we will @@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); } } @@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, * we will have no mle in the list to start with. now we can add an mle for * the migration and this should be the only one found for those scanning the * list. */ -int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_resource *res = NULL; @@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); - __dlm_mle_detach_hb_events(dlm, mle); + /* this was obviously WRONG. mle is uninited here. should be tmp. */ + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); } spin_unlock(&tmp->spinlock); } @@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* now add a migration mle to the tail of the list */ dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); @@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, clear_bit(dlm->node_num, iter.node_map); spin_unlock(&dlm->spinlock); + /* ownership of the lockres is changing. account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + mlog(0, "now time to do a migrate request to other nodes\n"); ret = dlm_do_migrate_request(dlm, res, old_master, dlm->node_num, &iter); @@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, res->lockname.len, res->lockname.name); /* this call now finishes out the nodemap * even if one or more nodes die */ - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { /* no longer need to retry. all living nodes contacted. */ @@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { mlog(0, "assert master to original master failed " diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 367a11e9e2e..6d4a83d5015 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -163,9 +163,6 @@ void dlm_dispatch_work(struct work_struct *work) dlm_workfunc_t *workfunc; int tot=0; - if (!dlm_joined(dlm)) - return; - spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); @@ -821,7 +818,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, } -int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; @@ -978,7 +976,8 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) } -int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; @@ -1129,6 +1128,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (total_locks == mres_total_locks) mres->flags |= DLM_MRES_ALL_DONE; + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + send_to); + /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, sz, send_to, &status); @@ -1213,6 +1217,34 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, return 0; } +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_migratable_lockres *mres, @@ -1260,6 +1292,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, goto error; } } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ? "recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); if (ret < 0) @@ -1293,7 +1333,8 @@ error: * do we spin? returning an error only delays the problem really */ -int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_migratable_lockres *mres = @@ -1382,17 +1423,21 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); + wake_up(&res->wq); /* add an extra ref for just-allocated lockres * otherwise the lockres will be purged immediately */ dlm_lockres_get(res); - } /* at this point we have allocated everything we need, * and we have a hashed lockres with an extra ref and * the proper res->state flags. */ ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { /* migration cannot have an unknown master */ BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); @@ -1400,10 +1445,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) "unknown owner.. will need to requery: " "%.*s\n", mres->lockname_len, mres->lockname); } else { - spin_lock(&res->spinlock); + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ dlm_change_lockres_owner(dlm, res, dlm->node_num); - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* queue up work for dlm_mig_lockres_worker */ dlm_grab(dlm); /* get an extra ref for the work item */ @@ -1459,6 +1505,9 @@ again: "this node will take it.\n", res->lockname.len, res->lockname.name); } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); mlog(0, "master needs to respond to sender " "that node %u still owns %.*s\n", real_master, res->lockname.len, @@ -1578,7 +1627,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, /* this function cannot error, so unless the sending * or receiving of the message failed, the owner can * be trusted */ -int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; @@ -1660,21 +1710,38 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, { struct dlm_migratable_lock *ml; struct list_head *queue; + struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i, bad; + int i, j, bad; struct list_head *iter; struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; mlog(0, "running %d locks for this lockres\n", mres->num_locks); for (i=0; i<mres->num_locks; i++) { ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } BUG_ON(ml->highest_blocked != LKM_IVMODE); newlock = NULL; lksb = NULL; queue = dlm_list_num_to_pointer(res, ml->list); + tmpq = NULL; /* if the lock is for the local node it needs to * be moved to the proper location within the queue. @@ -1684,11 +1751,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); spin_lock(&res->spinlock); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { + tmpq = dlm_list_idx_to_ptr(res, j); + list_for_each(iter, tmpq) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + if (lock) break; } @@ -1698,12 +1770,20 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, u64 c = ml->cookie; mlog(ML_ERROR, "could not find local lock " "with cookie %u:%llu!\n", - dlm_get_lock_cookie_node(c), - dlm_get_lock_cookie_seq(c)); + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); + __dlm_print_one_lock_resource(res); BUG(); } BUG_ON(lock->ml.node != ml->node); + if (tmpq != queue) { + mlog(0, "lock was on %u instead of %u for %.*s\n", + j, ml->list, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + continue; + } + /* see NOTE above about why we do not update * to match the master here */ @@ -1711,6 +1791,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); + added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -1799,14 +1880,14 @@ skip_lvb: mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " "exists on this lockres!\n", dlm->name, res->lockname.len, res->lockname.name, - dlm_get_lock_cookie_node(c), - dlm_get_lock_cookie_seq(c)); + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c))); mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " "node=%u, cookie=%u:%llu, queue=%d\n", ml->type, ml->convert_type, ml->node, - dlm_get_lock_cookie_node(ml->cookie), - dlm_get_lock_cookie_seq(ml->cookie), + dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)), ml->list); __dlm_print_one_lock_resource(res); @@ -1817,12 +1898,22 @@ skip_lvb: if (!bad) { dlm_lock_get(newlock); list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(ml->node, res); + added++; } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); leave: + /* balance the ref taken when the work was queued */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + if (ret < 0) { mlog_errno(ret); if (newlock) @@ -1935,9 +2026,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, if (res->owner == dead_node) { list_del_init(&res->recovering); spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -1977,9 +2070,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm_lockres_put(res); } spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -2048,6 +2143,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct list_head *iter, *tmpiter; struct dlm_lock *lock; + unsigned int freed = 0; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2062,6 +2158,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->converting) { @@ -2069,6 +2166,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->blocked) { @@ -2076,9 +2174,23 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + BUG_ON(!test_bit(dead_node, res->refmap)); + dlm_lockres_clear_refmap_bit(dead_node, res); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dead_node, res); + } + /* do not kick thread yet */ __dlm_dirty_lockres(dlm, res); } @@ -2141,9 +2253,21 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); - if (res->owner == dead_node) + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + + /* the wake_up for this will happen when the + * RECOVERING flag is dropped later */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, res); - else if (res->owner == dlm->node_num) { + } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); } @@ -2480,7 +2604,8 @@ retry: return ret; } -int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; @@ -2608,7 +2733,8 @@ stage2: return ret; } -int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 0c822f3ffb0..8ffa0916eb8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -54,9 +54,6 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); - static void dlm_flush_asts(struct dlm_ctxt *dlm); #define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) @@ -82,14 +79,33 @@ repeat: current->state = TASK_RUNNING; } - -int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && - list_empty(&res->blocked) && - list_empty(&res->dirty)) - return 1; + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. */ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + if (!__dlm_lockres_has_locks(res) && + (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { + /* try not to scan the bitmap unless the first two + * conditions are already true */ + int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* since the bit for dlm->node_num is not + * set, inflight_locks better be zero */ + BUG_ON(res->inflight_locks != 0); + return 1; + } + } return 0; } @@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ - /* For now, just keep any resource we master */ - if (res->owner == dlm->node_num) - { - if (!list_empty(&res->purge)) { - mlog(0, "we master %s:%.*s, but it is on " - "the purge list. Removing\n", - dlm->name, res->lockname.len, - res->lockname.name); - list_del_init(&res->purge); - dlm->purge_count--; - } - return; - } - if (list_empty(&res->purge)) { - mlog(0, "putting lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "putting lockres %.*s:%p onto purge list\n", + res->lockname.len, res->lockname.name, res); res->last_used = jiffies; + dlm_lockres_get(res); list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; - - /* if this node is not the owner, there is - * no way to keep track of who the owner could be. - * unhash it to avoid serious problems. */ - if (res->owner != dlm->node_num) { - mlog(0, "%s:%.*s: doing immediate " - "purge of lockres owned by %u\n", - dlm->name, res->lockname.len, - res->lockname.name, res->owner); - - dlm_purge_lockres_now(dlm, res); - } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list, " - "owner=%u\n", res->lockname.len, res->lockname.name, - res->owner); + mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", + res->lockname.len, res->lockname.name, res, res->owner); list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } } @@ -163,68 +154,65 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } -/* TODO: Eventual API: Called with the dlm spinlock held, may drop it - * to do migration, but will re-acquire before exit. */ -void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) +static int dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { int master; - int ret; - - spin_lock(&lockres->spinlock); - master = lockres->owner == dlm->node_num; - spin_unlock(&lockres->spinlock); + int ret = 0; - mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, - lockres->lockname.name, master); - - /* Non master is the easy case -- no migration required, just - * quit. */ + spin_lock(&res->spinlock); + if (!__dlm_lockres_unused(res)) { + spin_unlock(&res->spinlock); + mlog(0, "%s:%.*s: tried to purge but not unused\n", + dlm->name, res->lockname.len, res->lockname.name); + return -ENOTEMPTY; + } + master = (res->owner == dlm->node_num); if (!master) - goto finish; - - /* Wheee! Migrate lockres here! */ - spin_unlock(&dlm->spinlock); -again: + res->state |= DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); - ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); - if (ret == -ENOTEMPTY) { - mlog(ML_ERROR, "lockres %.*s still has local locks!\n", - lockres->lockname.len, lockres->lockname.name); + mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, + res->lockname.name, master); - BUG(); - } else if (ret < 0) { - mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", - lockres->lockname.len, lockres->lockname.name); - msleep(100); - goto again; + if (!master) { + spin_lock(&res->spinlock); + /* This ensures that clear refmap is sent after the set */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + spin_unlock(&res->spinlock); + /* drop spinlock to do messaging, retake below */ + spin_unlock(&dlm->spinlock); + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + } + mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", + dlm->name, res->lockname.len, res->lockname.name, ret); + spin_lock(&dlm->spinlock); } - spin_lock(&dlm->spinlock); - -finish: - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); + if (!list_empty(&res->purge)) { + mlog(0, "removing lockres %.*s:%p from purgelist, " + "master = %d\n", res->lockname.len, res->lockname.name, + res, master); + list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } - __dlm_unhash_lockres(lockres); -} - -/* make an unused lockres go away immediately. - * as soon as the dlm spinlock is dropped, this lockres - * will not be found. kfree still happens on last put. */ -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres) -{ - assert_spin_locked(&dlm->spinlock); - assert_spin_locked(&lockres->spinlock); + __dlm_unhash_lockres(res); - BUG_ON(!__dlm_lockres_unused(lockres)); - - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); - dlm->purge_count--; + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. */ + if (!master) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); } - __dlm_unhash_lockres(lockres); + return 0; } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -268,13 +256,17 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, break; } + mlog(0, "removing lockres %.*s:%p from purgelist\n", + lockres->lockname.len, lockres->lockname.name, lockres); list_del_init(&lockres->purge); + dlm_lockres_put(lockres); dlm->purge_count--; /* This may drop and reacquire the dlm spinlock if it * has to do migration. */ mlog(0, "calling dlm_purge_lockres!\n"); - dlm_purge_lockres(dlm, lockres); + if (dlm_purge_lockres(dlm, lockres)) + BUG(); mlog(0, "DONE calling dlm_purge_lockres!\n"); /* Avoid adding any scheduling latencies */ @@ -467,12 +459,17 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num) && - !(res->state & DLM_LOCK_RES_DIRTY)) { - /* ref for dirty_list */ - dlm_lockres_get(res); - list_add_tail(&res->dirty, &dlm->dirty_list); - res->state |= DLM_LOCK_RES_DIRTY; + if ((res->owner == dlm->node_num)) { + if (res->state & (DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_BLOCK_DIRTY)) + return; + + if (list_empty(&res->dirty)) { + /* ref for dirty_list */ + dlm_lockres_get(res); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } } } @@ -651,7 +648,7 @@ static int dlm_thread(void *data) dlm_lockres_get(res); spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_DIRTY; + /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */ list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); @@ -675,10 +672,11 @@ static int dlm_thread(void *data) /* it is now ok to move lockreses in these states * to the dirty list, assuming that they will only be * dirty for a short while. */ + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_RECOVERING)) { /* move it to the tail and keep going */ + res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); mlog(0, "delaying list shuffling for in-" "progress lockres %.*s, state=%d\n", @@ -699,6 +697,7 @@ static int dlm_thread(void *data) /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); + res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); dlm_lockres_calc_usage(dlm, res); @@ -709,11 +708,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { - /* ref for dirty_list */ - dlm_lockres_get(res); spin_lock(&res->spinlock); - list_add_tail(&res->dirty, &dlm->dirty_list); - res->state |= DLM_LOCK_RES_DIRTY; + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); } dlm_lockres_put(res); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 37be4b2e0d4..86ca085ef32 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -147,6 +147,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, goto leave; } + if (res->state & DLM_LOCK_RES_MIGRATING) { + status = DLM_MIGRATING; + goto leave; + } /* see above for what the spec says about * LKM_CANCEL and the lock queue state */ @@ -244,8 +248,8 @@ leave: /* this should always be coupled with list removal */ BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); mlog(0, "lock %u:%llu should be gone now! refs=%d\n", - dlm_get_lock_cookie_node(lock->ml.cookie), - dlm_get_lock_cookie_seq(lock->ml.cookie), + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), atomic_read(&lock->lock_refs.refcount)-1); dlm_lock_put(lock); } @@ -379,7 +383,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, * return value from dlmunlock_master */ -int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; @@ -502,8 +507,8 @@ not_found: if (!found) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", - dlm_get_lock_cookie_node(unlock->cookie), - dlm_get_lock_cookie_seq(unlock->cookie)); + dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie))); else dlm_lock_put(lock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 10953a508f2..f2cd3bf9efb 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1365,13 +1365,13 @@ bail: return ret; } -struct inode_operations ocfs2_file_iops = { +const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, }; -struct inode_operations ocfs2_special_file_iops = { +const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 601a453f18a..cc973f01f6c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -28,8 +28,8 @@ extern const struct file_operations ocfs2_fops; extern const struct file_operations ocfs2_dops; -extern struct inode_operations ocfs2_file_iops; -extern struct inode_operations ocfs2_special_file_iops; +extern const struct inode_operations ocfs2_file_iops; +extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; enum ocfs2_alloc_restarted { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index e1216364d19..d026b4f2775 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -306,8 +306,8 @@ int ocfs2_journal_dirty_data(handle_t *handle, * for the dinode, one for the new block. */ #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) -/* file update (nlink, etc) + dir entry block */ -#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ +#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) /* inode + dir inode (if we unlink a dir), + dir entry block + orphan * dir inode link */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f3d7803b4b4..28dd757ff67 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1098,7 +1098,7 @@ static int ocfs2_rename(struct inode *old_dir, BUG(); } - /* Assume a directory heirarchy thusly: + /* Assume a directory hierarchy thusly: * a/b/c * a/d * a,b,c, and d are all directories. @@ -2306,7 +2306,7 @@ leave: return status; } -struct inode_operations ocfs2_dir_iops = { +const struct inode_operations ocfs2_dir_iops = { .create = ocfs2_create, .lookup = ocfs2_lookup, .link = ocfs2_link, diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 8425944fccc..0975c7b7212 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -26,7 +26,7 @@ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H -extern struct inode_operations ocfs2_dir_iops; +extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6e300a88a47..6534f92424d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -116,7 +116,7 @@ static void ocfs2_destroy_inode(struct inode *inode); static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); -static struct super_operations ocfs2_sops = { +static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .destroy_inode = ocfs2_destroy_inode, diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 03b0191534d..40dc1a51f4a 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -170,12 +170,12 @@ bail: return ERR_PTR(status); } -struct inode_operations ocfs2_symlink_inode_operations = { +const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = page_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, }; -struct inode_operations ocfs2_fast_symlink_inode_operations = { +const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h index 1ea9e4d9e9e..65a6c9c6ad5 100644 --- a/fs/ocfs2/symlink.h +++ b/fs/ocfs2/symlink.h @@ -26,8 +26,8 @@ #ifndef OCFS2_SYMLINK_H #define OCFS2_SYMLINK_H -extern struct inode_operations ocfs2_symlink_inode_operations; -extern struct inode_operations ocfs2_fast_symlink_inode_operations; +extern const struct inode_operations ocfs2_symlink_inode_operations; +extern const struct inode_operations ocfs2_fast_symlink_inode_operations; /* * Test whether an inode is a fast symlink. diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index 0afd8b9af70..f30e63b9910 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -887,7 +887,7 @@ static inline int ocfs2_translate_response(int response) static int ocfs2_handle_response_message(struct o2net_msg *msg, u32 len, - void *data) + void *data, void **ret_data) { unsigned int response_id, node_num; int response_status; @@ -943,7 +943,7 @@ bail: static int ocfs2_handle_vote_message(struct o2net_msg *msg, u32 len, - void *data) + void *data, void **ret_data) { int status; struct ocfs2_super *osb = data; @@ -1007,7 +1007,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb) osb->net_key, sizeof(struct ocfs2_response_msg), ocfs2_handle_response_message, - osb, &osb->osb_net_handlers); + osb, NULL, &osb->osb_net_handlers); if (status) { mlog_errno(status); goto bail; @@ -1017,7 +1017,7 @@ int ocfs2_register_net_handlers(struct ocfs2_super *osb) osb->net_key, sizeof(struct ocfs2_vote_msg), ocfs2_handle_vote_message, - osb, &osb->osb_net_handlers); + osb, NULL, &osb->osb_net_handlers); if (status) { mlog_errno(status); goto bail; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 99c0bc37ba0..bde1c164417 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -169,7 +169,7 @@ static const struct file_operations openprom_operations = { static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static struct inode_operations openprom_inode_operations = { +static const struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; @@ -364,7 +364,7 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations openprom_sops = { +static const struct super_operations openprom_sops = { .alloc_inode = openprom_alloc_inode, .destroy_inode = openprom_destroy_inode, .read_inode = openprom_read_inode, diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 3d73d94d93a..22d38ffc9ef 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -358,14 +358,13 @@ void delete_partition(struct gendisk *disk, int part) p->ios[0] = p->ios[1] = 0; p->sectors[0] = p->sectors[1] = 0; sysfs_remove_link(&p->kobj, "subsystem"); - if (p->holder_dir) - kobject_unregister(p->holder_dir); + kobject_unregister(p->holder_dir); kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); kobject_put(&p->kobj); } -void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) +void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) { struct hd_struct *p; @@ -390,6 +389,15 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) if (!disk->part_uevent_suppress) kobject_uevent(&p->kobj, KOBJ_ADD); sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem"); + if (flags & ADDPART_FLAG_WHOLEDISK) { + static struct attribute addpartattr = { + .name = "whole_disk", + .mode = S_IRUSR | S_IRGRP | S_IROTH, + .owner = THIS_MODULE, + }; + + sysfs_create_file(&p->kobj, &addpartattr); + } partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -543,9 +551,9 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) printk(" %s: p%d exceeds device capacity\n", disk->disk_name, p); } - add_partition(disk, p, from, size); + add_partition(disk, p, from, size, state->parts[p].flags); #ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags) + if (state->parts[p].flags & ADDPART_FLAG_RAID) md_autodetect_dev(bdev->bd_dev+p); #endif } @@ -594,10 +602,8 @@ void del_gendisk(struct gendisk *disk) disk->stamp = 0; kobject_uevent(&disk->kobj, KOBJ_REMOVE); - if (disk->holder_dir) - kobject_unregister(disk->holder_dir); - if (disk->slave_dir) - kobject_unregister(disk->slave_dir); + kobject_unregister(disk->holder_dir); + kobject_unregister(disk->slave_dir); if (disk->driverfs_dev) { char *disk_name = make_block_name(disk); sysfs_remove_link(&disk->kobj, "device"); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 8c7af177781..4ccec4cd136 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -63,15 +63,25 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(unsigned char *p, struct block_device *bdev) { + struct partition *pt = (struct partition *) (p + 0x1be); Sector sect; unsigned char *d; - int ret = 0; + int slot, ret = 0; - if (p[0] != AIX_LABEL_MAGIC1 && - p[1] != AIX_LABEL_MAGIC2 && - p[2] != AIX_LABEL_MAGIC3 && - p[3] != AIX_LABEL_MAGIC4) + if (!(p[0] == AIX_LABEL_MAGIC1 && + p[1] == AIX_LABEL_MAGIC2 && + p[2] == AIX_LABEL_MAGIC3 && + p[3] == AIX_LABEL_MAGIC4)) return 0; + /* Assume the partition table is valid if Linux partitions exists */ + for (slot = 1; slot <= 4; slot++, pt++) { + if (pt->sys_ind == LINUX_SWAP_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) + return 0; + } d = read_dev_sector(bdev, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') @@ -155,7 +165,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, put_partition(state, state->next, next, size); if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[state->next].flags = 1; + state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; if (++state->next == state->limit) goto done; diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index 6fa4ff89510..ed5ac83fe83 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -72,7 +72,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) if (blocks) { put_partition(state, slot, start, blocks); if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; } slot++; } diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index 0a5927c806c..123f8b46c8b 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -80,8 +80,11 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) num_sectors = be32_to_cpu(p->num_sectors); if (num_sectors) { put_partition(state, slot, st_sector, num_sectors); + state->parts[slot].flags = 0; if (label->infos[i].id == LINUX_RAID_PARTITION) - state->parts[slot].flags = 1; + state->parts[slot].flags |= ADDPART_FLAG_RAID; + if (label->infos[i].id == SUN_WHOLE_DISK) + state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; } slot++; } diff --git a/fs/pipe.c b/fs/pipe.c index 68090e84f58..ebafde7d6ab 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -16,6 +16,7 @@ #include <linux/uio.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -985,6 +986,10 @@ int do_pipe(int *fd) goto err_fdr; fdw = error; + error = audit_fd_pair(fdr, fdw); + if (error < 0) + goto err_fdw; + fd_install(fdr, fr); fd_install(fdw, fw); fd[0] = fdr; @@ -992,6 +997,8 @@ int do_pipe(int *fd) return 0; + err_fdw: + put_unused_fd(fdw); err_fdr: put_unused_fd(fdr); err_read_pipe: diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f6c77627257..a6b3a8f878f 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,7 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o proc_misc.o + proc_tty.o proc_misc.o proc_sysctl.o proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 70e4fab117b..07c9cdbcdca 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -351,7 +351,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) struct signal_struct *sig = task->signal; if (sig->tty) { - tty_pgrp = sig->tty->pgrp; + tty_pgrp = pid_nr(sig->tty->pgrp); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 1a979ea3b37..4f5745af8c1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -93,8 +93,8 @@ struct pid_entry { int len; char *name; mode_t mode; - struct inode_operations *iop; - struct file_operations *fop; + const struct inode_operations *iop; + const struct file_operations *fop; union proc_op op; }; @@ -352,7 +352,7 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr) return error; } -static struct inode_operations proc_def_inode_operations = { +static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -424,7 +424,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) return res; } -static struct file_operations proc_mounts_operations = { +static const struct file_operations proc_mounts_operations = { .open = mounts_open, .read = seq_read, .llseek = seq_lseek, @@ -462,7 +462,7 @@ static int mountstats_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations proc_mountstats_operations = { +static const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read = seq_read, .llseek = seq_lseek, @@ -501,7 +501,7 @@ out_no_task: return length; } -static struct file_operations proc_info_file_operations = { +static const struct file_operations proc_info_file_operations = { .read = proc_info_read, }; @@ -646,7 +646,7 @@ static loff_t mem_lseek(struct file * file, loff_t offset, int orig) return file->f_pos; } -static struct file_operations proc_mem_operations = { +static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, @@ -710,7 +710,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return end - buffer; } -static struct file_operations proc_oom_adjust_operations = { +static const struct file_operations proc_oom_adjust_operations = { .read = oom_adjust_read, .write = oom_adjust_write, }; @@ -777,7 +777,7 @@ out_free_page: return length; } -static struct file_operations proc_loginuid_operations = { +static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, }; @@ -849,7 +849,7 @@ out_no_task: return result; } -static struct file_operations proc_seccomp_operations = { +static const struct file_operations proc_seccomp_operations = { .read = seccomp_read, .write = seccomp_write, }; @@ -908,7 +908,7 @@ static ssize_t proc_fault_inject_write(struct file * file, return end - buffer; } -static struct file_operations proc_fault_inject_operations = { +static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, }; @@ -980,7 +980,7 @@ out: return error; } -static struct inode_operations proc_pid_link_inode_operations = { +static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, @@ -1408,7 +1408,7 @@ out_no_task: return retval; } -static struct file_operations proc_fd_operations = { +static const struct file_operations proc_fd_operations = { .read = generic_read_dir, .readdir = proc_readfd, }; @@ -1416,7 +1416,7 @@ static struct file_operations proc_fd_operations = { /* * proc directories can do almost nothing.. */ -static struct inode_operations proc_fd_inode_operations = { +static const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .setattr = proc_setattr, }; @@ -1623,7 +1623,7 @@ out_no_task: return length; } -static struct file_operations proc_pid_attr_operations = { +static const struct file_operations proc_pid_attr_operations = { .read = proc_pid_attr_read, .write = proc_pid_attr_write, }; @@ -1644,7 +1644,7 @@ static int proc_attr_dir_readdir(struct file * filp, attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); } -static struct file_operations proc_attr_dir_operations = { +static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .readdir = proc_attr_dir_readdir, }; @@ -1656,7 +1656,7 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } -static struct inode_operations proc_attr_dir_inode_operations = { +static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, @@ -1682,7 +1682,7 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) return ERR_PTR(vfs_follow_link(nd,tmp)); } -static struct inode_operations proc_self_inode_operations = { +static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .follow_link = proc_self_follow_link, }; @@ -1810,17 +1810,21 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filld static int proc_pid_io_accounting(struct task_struct *task, char *buffer) { return sprintf(buffer, +#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" +#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", +#ifdef CONFIG_TASK_XACCT (unsigned long long)task->rchar, (unsigned long long)task->wchar, (unsigned long long)task->syscr, (unsigned long long)task->syscw, +#endif (unsigned long long)task->ioac.read_bytes, (unsigned long long)task->ioac.write_bytes, (unsigned long long)task->ioac.cancelled_write_bytes); @@ -1830,8 +1834,8 @@ static int proc_pid_io_accounting(struct task_struct *task, char *buffer) /* * Thread groups */ -static struct file_operations proc_task_operations; -static struct inode_operations proc_task_inode_operations; +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; static struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, task), @@ -1890,7 +1894,7 @@ static int proc_tgid_base_readdir(struct file * filp, tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); } -static struct file_operations proc_tgid_base_operations = { +static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .readdir = proc_tgid_base_readdir, }; @@ -1900,7 +1904,7 @@ static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *de tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } -static struct inode_operations proc_tgid_base_inode_operations = { +static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, @@ -2173,12 +2177,12 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } -static struct file_operations proc_tid_base_operations = { +static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, .readdir = proc_tid_base_readdir, }; -static struct inode_operations proc_tid_base_inode_operations = { +static const struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, @@ -2404,13 +2408,13 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } -static struct inode_operations proc_task_inode_operations = { +static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, }; -static struct file_operations proc_task_operations = { +static const struct file_operations proc_task_operations = { .read = generic_read_dir, .readdir = proc_task_readdir, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 853cb877d5f..775fb21294d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -32,14 +32,14 @@ static loff_t proc_file_lseek(struct file *, loff_t, int); DEFINE_SPINLOCK(proc_subdir_lock); -int proc_match(int len, const char *name, struct proc_dir_entry *de) +static int proc_match(int len, const char *name, struct proc_dir_entry *de) { if (de->namelen != len) return 0; return !memcmp(name, de->name, len); } -static struct file_operations proc_file_operations = { +static const struct file_operations proc_file_operations = { .llseek = proc_file_lseek, .read = proc_file_read, .write = proc_file_write, @@ -265,7 +265,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static struct inode_operations proc_file_inode_operations = { +static const struct inode_operations proc_file_inode_operations = { .setattr = proc_notify_change, }; @@ -357,7 +357,7 @@ static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -static struct inode_operations proc_link_inode_operations = { +static const struct inode_operations proc_link_inode_operations = { .readlink = generic_readlink, .follow_link = proc_follow_link, }; @@ -497,7 +497,7 @@ out: unlock_kernel(); * use the in-memory "struct proc_dir_entry" tree to parse * the /proc directory. */ -static struct file_operations proc_dir_operations = { +static const struct file_operations proc_dir_operations = { .read = generic_read_dir, .readdir = proc_readdir, }; @@ -505,7 +505,7 @@ static struct file_operations proc_dir_operations = { /* * proc directories can do almost nothing.. */ -static struct inode_operations proc_dir_inode_operations = { +static const struct inode_operations proc_dir_inode_operations = { .lookup = proc_lookup, .getattr = proc_getattr, .setattr = proc_notify_change, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e26945ba685..c372eb151a3 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -132,7 +132,7 @@ static int proc_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations proc_sops = { +static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .read_inode = proc_read_inode, @@ -161,6 +161,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, if (!inode) goto out_ino; + PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; if (de) { if (de->mode) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 987c773dbb2..c932aa65e19 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,8 @@ #include <linux/proc_fs.h> +extern int proc_sys_init(void); + struct vmalloc_info { unsigned long used; unsigned long largest_chunk; @@ -38,13 +40,13 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); -extern struct file_operations proc_maps_operations; -extern struct file_operations proc_numa_maps_operations; -extern struct file_operations proc_smaps_operations; +extern const struct file_operations proc_maps_operations; +extern const struct file_operations proc_numa_maps_operations; +extern const struct file_operations proc_smaps_operations; -extern struct file_operations proc_maps_operations; -extern struct file_operations proc_numa_maps_operations; -extern struct file_operations proc_smaps_operations; +extern const struct file_operations proc_maps_operations; +extern const struct file_operations proc_numa_maps_operations; +extern const struct file_operations proc_smaps_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 5ec67257e5f..22f789de390 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -128,7 +128,7 @@ static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) return seq_open(file, &proc_nommu_vma_list_seqop); } -static struct file_operations proc_nommu_vma_list_operations = { +static const struct file_operations proc_nommu_vma_list_operations = { .open = proc_nommu_vma_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b37ce33f67e..e2c4c0a5c90 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -121,16 +121,11 @@ static int meminfo_read_proc(char *page, char **start, off_t off, { struct sysinfo i; int len; - unsigned long inactive; - unsigned long active; - unsigned long free; unsigned long committed; unsigned long allowed; struct vmalloc_info vmi; long cached; - get_zone_counts(&active, &inactive, &free); - /* * display in kilobytes. */ @@ -187,8 +182,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(i.bufferram), K(cached), K(total_swapcache_pages), - K(active), - K(inactive), + K(global_page_state(NR_ACTIVE)), + K(global_page_state(NR_INACTIVE)), #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), @@ -228,7 +223,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static struct file_operations fragmentation_file_operations = { +static const struct file_operations fragmentation_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -241,7 +236,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations proc_zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -266,7 +261,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) return seq_open(file, &cpuinfo_op); } -static struct file_operations proc_cpuinfo_operations = { +static const struct file_operations proc_cpuinfo_operations = { .open = cpuinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -325,7 +320,7 @@ static int devinfo_open(struct inode *inode, struct file *filp) return seq_open(filp, &devinfo_ops); } -static struct file_operations proc_devinfo_operations = { +static const struct file_operations proc_devinfo_operations = { .open = devinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -337,7 +332,7 @@ static int vmstat_open(struct inode *inode, struct file *file) { return seq_open(file, &vmstat_op); } -static struct file_operations proc_vmstat_file_operations = { +static const struct file_operations proc_vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -368,7 +363,7 @@ static int partitions_open(struct inode *inode, struct file *file) { return seq_open(file, &partitions_op); } -static struct file_operations proc_partitions_operations = { +static const struct file_operations proc_partitions_operations = { .open = partitions_open, .read = seq_read, .llseek = seq_lseek, @@ -380,7 +375,7 @@ static int diskstats_open(struct inode *inode, struct file *file) { return seq_open(file, &diskstats_op); } -static struct file_operations proc_diskstats_operations = { +static const struct file_operations proc_diskstats_operations = { .open = diskstats_open, .read = seq_read, .llseek = seq_lseek, @@ -394,7 +389,7 @@ static int modules_open(struct inode *inode, struct file *file) { return seq_open(file, &modules_op); } -static struct file_operations proc_modules_operations = { +static const struct file_operations proc_modules_operations = { .open = modules_open, .read = seq_read, .llseek = seq_lseek, @@ -409,7 +404,7 @@ static int slabinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &slabinfo_op); } -static struct file_operations proc_slabinfo_operations = { +static const struct file_operations proc_slabinfo_operations = { .open = slabinfo_open, .read = seq_read, .write = slabinfo_write, @@ -443,7 +438,7 @@ static int slabstats_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations proc_slabstats_operations = { +static const struct file_operations proc_slabstats_operations = { .open = slabstats_open, .read = seq_read, .llseek = seq_lseek, @@ -556,7 +551,7 @@ static int stat_open(struct inode *inode, struct file *file) kfree(buf); return res; } -static struct file_operations proc_stat_operations = { +static const struct file_operations proc_stat_operations = { .open = stat_open, .read = seq_read, .llseek = seq_lseek, @@ -598,7 +593,7 @@ static int interrupts_open(struct inode *inode, struct file *filp) return seq_open(filp, &int_seq_ops); } -static struct file_operations proc_interrupts_operations = { +static const struct file_operations proc_interrupts_operations = { .open = interrupts_open, .read = seq_read, .llseek = seq_lseek, @@ -655,7 +650,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, return count; } -static struct file_operations proc_sysrq_trigger_operations = { +static const struct file_operations proc_sysrq_trigger_operations = { .write = write_sysrq_trigger, }; #endif @@ -672,7 +667,6 @@ void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) void __init proc_misc_init(void) { - struct proc_dir_entry *entry; static struct { char *name; int (*read_proc)(char*,char**,off_t,int,int*,void*); @@ -700,9 +694,12 @@ void __init proc_misc_init(void) /* And now for trickier ones */ #ifdef CONFIG_PRINTK - entry = create_proc_entry("kmsg", S_IRUSR, &proc_root); - if (entry) - entry->proc_fops = &proc_kmsg_operations; + { + struct proc_dir_entry *entry; + entry = create_proc_entry("kmsg", S_IRUSR, &proc_root); + if (entry) + entry->proc_fops = &proc_kmsg_operations; + } #endif create_seq_entry("devices", 0, &proc_devinfo_operations); create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); @@ -743,8 +740,11 @@ void __init proc_misc_init(void) proc_vmcore->proc_fops = &proc_vmcore_operations; #endif #ifdef CONFIG_MAGIC_SYSRQ - entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); - if (entry) - entry->proc_fops = &proc_sysrq_trigger_operations; + { + struct proc_dir_entry *entry; + entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); + if (entry) + entry->proc_fops = &proc_sysrq_trigger_operations; + } #endif } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c new file mode 100644 index 00000000000..20e8cbb3436 --- /dev/null +++ b/fs/proc/proc_sysctl.c @@ -0,0 +1,479 @@ +/* + * /proc/sys support + */ + +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include "internal.h" + +static struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static struct inode_operations proc_sys_inode_operations; + +static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table) +{ + /* Refresh the cached information bits in the inode */ + if (table) { + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_mode = table->mode; + if (table->proc_handler) { + inode->i_mode |= S_IFREG; + inode->i_nlink = 1; + } else { + inode->i_mode |= S_IFDIR; + inode->i_nlink = 0; /* It is too hard to figure out */ + } + } +} + +static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table) +{ + struct inode *inode; + struct proc_inode *dir_ei, *ei; + int depth; + + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* A directory is always one deeper than it's parent */ + dir_ei = PROC_I(dir); + depth = dir_ei->fd + 1; + + ei = PROC_I(inode); + ei->fd = depth; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ + proc_sys_refresh_inode(inode, table); +out: + return inode; +} + +static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth) +{ + for (;;) { + struct proc_inode *ei; + + ei = PROC_I(dentry->d_inode); + if (ei->fd == depth) + break; /* found */ + + dentry = dentry->d_parent; + } + return dentry; +} + +static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table, + struct qstr *name) +{ + int len; + for ( ; table->ctl_name || table->procname; table++) { + + if (!table->procname) + continue; + + len = strlen(table->procname); + if (len != name->len) + continue; + + if (memcmp(table->procname, name->name, len) != 0) + continue; + + /* I have a match */ + return table; + } + return NULL; +} + +static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry, + struct ctl_table *table) +{ + struct dentry *ancestor; + struct proc_inode *ei; + int depth, i; + + ei = PROC_I(dentry->d_inode); + depth = ei->fd; + + if (depth == 0) + return table; + + for (i = 1; table && (i <= depth); i++) { + ancestor = proc_sys_ancestor(dentry, i); + table = proc_sys_lookup_table_one(table, &ancestor->d_name); + if (table) + table = table->child; + } + return table; + +} +static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent, + struct qstr *name, + struct ctl_table *table) +{ + table = proc_sys_lookup_table(dparent, table); + if (table) + table = proc_sys_lookup_table_one(table, name); + return table; +} + +static struct ctl_table *do_proc_sys_lookup(struct dentry *parent, + struct qstr *name, + struct ctl_table_header **ptr) +{ + struct ctl_table_header *head; + struct ctl_table *table = NULL; + + for (head = sysctl_head_next(NULL); head; + head = sysctl_head_next(head)) { + table = proc_sys_lookup_entry(parent, name, head->ctl_table); + if (table) + break; + } + *ptr = head; + return table; +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head; + struct inode *inode; + struct dentry *err; + struct ctl_table *table; + + err = ERR_PTR(-ENOENT); + table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + if (!table) + goto out; + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir, table); + if (!inode) + goto out; + + err = NULL; + dentry->d_op = &proc_sys_dentry_operations; + d_add(dentry, inode); + +out: + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = filp->f_dentry; + struct ctl_table_header *head; + struct ctl_table *table; + ssize_t error, res; + + table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + /* Has the sysctl entry disappeared on us? */ + error = -ENOENT; + if (!table) + goto out; + + /* Has the sysctl entry been replaced by a directory? */ + error = -EISDIR; + if (!table->proc_handler) + goto out; + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(table, MAY_READ)) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, 0, filp, buf, &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = filp->f_dentry; + struct ctl_table_header *head; + struct ctl_table *table; + ssize_t error, res; + + table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + /* Has the sysctl entry disappeared on us? */ + error = -ENOENT; + if (!table) + goto out; + + /* Has the sysctl entry been replaced by a directory? */ + error = -EISDIR; + if (!table->proc_handler) + goto out; + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(table, MAY_WRITE)) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, 1, filp, (char __user *)buf, + &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + + +static int proc_sys_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct ctl_table *table) +{ + struct ctl_table_header *head; + struct ctl_table *child_table = NULL; + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + int ret; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(qname.name, qname.len); + + /* Suppress duplicates. + * Only fill a directory entry if it is the value that + * an ordinary lookup of that name returns. Hide all + * others. + * + * If we ever cache this translation in the dcache + * I should do a dcache lookup first. But for now + * it is just simpler not to. + */ + ret = 0; + child_table = do_proc_sys_lookup(dir, &qname, &head); + sysctl_head_finish(head); + if (child_table != table) + return 0; + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + inode = proc_sys_make_inode(dir->d_inode, table); + if (!inode) + child = ERR_PTR(-ENOMEM); + else { + new->d_op = &proc_sys_dentry_operations; + d_add(new, inode); + } + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino= find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = NULL; + struct ctl_table *table; + unsigned long pos; + int ret; + + ret = -ENOTDIR; + if (!S_ISDIR(inode->i_mode)) + goto out; + + ret = 0; + /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, + inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, filp->f_pos, + parent_ino(dentry), DT_DIR) < 0) + goto out; + filp->f_pos++; + } + pos = 2; + + /* - Find each instance of the directory + * - Read all entries in each instance + * - Before returning an entry to user space lookup the entry + * by name and if I find a different entry don't return + * this one because it means it is a buried dup. + * For sysctl this should only happen for directory entries. + */ + for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { + table = proc_sys_lookup_table(dentry, head->ctl_table); + + if (!table) + continue; + + for (; table->ctl_name || table->procname; table++, pos++) { + /* Can't do anything without a proc name */ + if (!table->procname) + continue; + + if (pos < filp->f_pos) + continue; + + if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0) + goto out; + filp->f_pos = pos + 1; + } + } + ret = 1; +out: + sysctl_head_finish(head); + return ret; +} + +static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + struct dentry *dentry; + int mode; + int depth; + int error; + + head = NULL; + depth = PROC_I(inode)->fd; + + /* First check the cached permissions, in case we don't have + * enough information to lookup the sysctl table entry. + */ + error = -EACCES; + mode = inode->i_mode; + + if (current->euid == 0) + mode >>= 6; + else if (in_group_p(0)) + mode >>= 3; + + if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask) + error = 0; + + /* If we can't get a sysctl table entry the permission + * checks on the cached mode will have to be enough. + */ + if (!nd || !depth) + goto out; + + dentry = nd->dentry; + table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + + /* If the entry does not exist deny permission */ + error = -EACCES; + if (!table) + goto out; + + /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(table, mask); +out: + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (!error) { + error = security_inode_setattr(dentry, attr); + if (!error) + error = inode_setattr(inode, attr); + } + + return error; +} + +/* I'm lazy and don't distinguish between files and directories, + * until access time. + */ +static const struct file_operations proc_sys_file_operations = { + .read = proc_sys_read, + .write = proc_sys_write, + .readdir = proc_sys_readdir, +}; + +static struct inode_operations proc_sys_inode_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct ctl_table_header *head; + struct ctl_table *table; + table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + proc_sys_refresh_inode(dentry->d_inode, table); + sysctl_head_finish(head); + return !!table; +} + +static struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, +}; + +static struct proc_dir_entry *proc_sys_root; + +int proc_sys_init(void) +{ + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_inode_operations; + proc_sys_root->proc_fops = &proc_sys_file_operations; + proc_sys_root->nlink = 0; + return 0; +} diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 15c4455b09e..c1bbfbeb035 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -138,7 +138,7 @@ static int tty_drivers_open(struct inode *inode, struct file *file) return seq_open(file, &tty_drivers_op); } -static struct file_operations proc_tty_drivers_operations = { +static const struct file_operations proc_tty_drivers_operations = { .open = tty_drivers_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/proc/root.c b/fs/proc/root.c index 64d242b6dcf..5834a744c2a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,10 +23,6 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; -#ifdef CONFIG_SYSCTL -struct proc_dir_entry *proc_sys_root; -#endif - static int proc_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { @@ -71,13 +67,6 @@ void __init proc_root_init(void) #ifdef CONFIG_SYSVIPC proc_mkdir("sysvipc", NULL); #endif -#ifdef CONFIG_SYSCTL - proc_sys_root = proc_mkdir("sys", NULL); -#endif -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - proc_mkdir("sys/fs", NULL); - proc_mkdir("sys/fs/binfmt_misc", NULL); -#endif proc_root_fs = proc_mkdir("fs", NULL); proc_root_driver = proc_mkdir("driver", NULL); proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ @@ -90,6 +79,9 @@ void __init proc_root_init(void) proc_device_tree_init(); #endif proc_bus = proc_mkdir("bus", NULL); +#ifdef CONFIG_SYSCTL + proc_sys_init(); +#endif } static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat @@ -136,7 +128,7 @@ static int proc_root_readdir(struct file * filp, * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ -static struct file_operations proc_root_operations = { +static const struct file_operations proc_root_operations = { .read = generic_read_dir, .readdir = proc_root_readdir, }; @@ -144,7 +136,7 @@ static struct file_operations proc_root_operations = { /* * proc root can do almost nothing.. */ -static struct inode_operations proc_root_inode_operations = { +static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 55ade0d1562..7445980c802 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -434,7 +434,7 @@ static int maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } -struct file_operations proc_maps_operations = { +const struct file_operations proc_maps_operations = { .open = maps_open, .read = seq_read, .llseek = seq_lseek, @@ -456,7 +456,7 @@ static int numa_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_numa_maps_op); } -struct file_operations proc_numa_maps_operations = { +const struct file_operations proc_numa_maps_operations = { .open = numa_maps_open, .read = seq_read, .llseek = seq_lseek, @@ -469,7 +469,7 @@ static int smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } -struct file_operations proc_smaps_operations = { +const struct file_operations proc_smaps_operations = { .open = smaps_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index fcc5caf93f5..7cddf6b8635 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -220,7 +220,7 @@ static int maps_open(struct inode *inode, struct file *file) return ret; } -struct file_operations proc_maps_operations = { +const struct file_operations proc_maps_operations = { .open = maps_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index c94db1db7a7..ea9ffefb48a 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -87,7 +87,7 @@ const struct file_operations qnx4_dir_operations = .fsync = file_fsync, }; -struct inode_operations qnx4_dir_inode_operations = +const struct inode_operations qnx4_dir_inode_operations = { .lookup = qnx4_lookup, #ifdef CONFIG_QNX4FS_RW diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index 467e5ac7280..44649981bbc 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -33,7 +33,7 @@ const struct file_operations qnx4_file_operations = #endif }; -struct inode_operations qnx4_file_inode_operations = +const struct inode_operations qnx4_file_inode_operations = { #ifdef CONFIG_QNX4FS_RW .truncate = qnx4_truncate, diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index c047dc654d5..83bc8e7824c 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -30,7 +30,7 @@ #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" -static struct super_operations qnx4_sops; +static const struct super_operations qnx4_sops; #ifdef CONFIG_QNX4FS_RW @@ -129,7 +129,7 @@ static void qnx4_read_inode(struct inode *); static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); -static struct super_operations qnx4_sops = +static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .destroy_inode = qnx4_destroy_inode, diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 54ebbc84207..2f14774a124 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -31,7 +31,7 @@ const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, .prepare_write = simple_prepare_write, .commit_write = simple_commit_write, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = __set_page_dirty_no_writeback, }; const struct file_operations ramfs_file_operations = { @@ -45,6 +45,6 @@ const struct file_operations ramfs_file_operations = { .llseek = generic_file_llseek, }; -struct inode_operations ramfs_file_inode_operations = { +const struct inode_operations ramfs_file_inode_operations = { .getattr = simple_getattr, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index e9d6c473328..d3fd7c6732d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -32,7 +32,7 @@ const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, .prepare_write = simple_prepare_write, .commit_write = simple_commit_write, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = __set_page_dirty_no_writeback, }; const struct file_operations ramfs_file_operations = { @@ -47,7 +47,7 @@ const struct file_operations ramfs_file_operations = { .llseek = generic_file_llseek, }; -struct inode_operations ramfs_file_inode_operations = { +const struct inode_operations ramfs_file_inode_operations = { .setattr = ramfs_nommu_setattr, .getattr = simple_getattr, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 2faf4cdf61b..ff1f7639707 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -40,8 +40,8 @@ /* some random number */ #define RAMFS_MAGIC 0x858458f6 -static struct super_operations ramfs_ops; -static struct inode_operations ramfs_dir_inode_operations; +static const struct super_operations ramfs_ops; +static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -143,7 +143,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } -static struct inode_operations ramfs_dir_inode_operations = { +static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, .link = simple_link, @@ -155,7 +155,7 @@ static struct inode_operations ramfs_dir_inode_operations = { .rename = simple_rename, }; -static struct super_operations ramfs_ops = { +static const struct super_operations ramfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index c2bb58e7465..af7cc074a47 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -12,4 +12,4 @@ extern const struct address_space_operations ramfs_aops; extern const struct file_operations ramfs_file_operations; -extern struct inode_operations ramfs_file_inode_operations; +extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c index 707ac21700d..1f8dc373ede 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -197,13 +197,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count struct inode *inode; loff_t pos; + inode = file->f_path.dentry->d_inode; if (unlikely((ssize_t) count < 0)) goto Einval; pos = *ppos; if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; - inode = file->f_path.dentry->d_inode; if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, @@ -274,9 +274,9 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) ret = do_sync_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file->f_path.dentry); - current->rchar += ret; + add_rchar(current, ret); } - current->syscr++; + inc_syscr(current); } } @@ -332,9 +332,9 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ ret = do_sync_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file->f_path.dentry); - current->wchar += ret; + add_wchar(current, ret); } - current->syscw++; + inc_syscw(current); } } @@ -675,8 +675,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) } if (ret > 0) - current->rchar += ret; - current->syscr++; + add_rchar(current, ret); + inc_syscr(current); return ret; } @@ -696,8 +696,8 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) } if (ret > 0) - current->wchar += ret; - current->syscw++; + add_wchar(current, ret); + inc_syscw(current); return ret; } @@ -779,12 +779,12 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); if (retval > 0) { - current->rchar += retval; - current->wchar += retval; + add_rchar(current, retval); + add_wchar(current, retval); } - current->syscr++; - current->syscw++; + inc_syscr(current); + inc_syscw(current); if (*ppos > max) retval = -EOVERFLOW; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index fba304e64de..f85c5cf4934 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -19,6 +19,7 @@ #include <linux/time.h> #include <linux/reiserfs_fs.h> #include <linux/buffer_head.h> +#include <linux/kernel.h> #ifdef CONFIG_REISERFS_CHECK @@ -1756,7 +1757,7 @@ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) if (buffer_dirty(bh)) reiserfs_warning(tb->tb_sb, "store_thrown deals with dirty buffer"); - for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++) + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) if (!tb->thrown[i]) { tb->thrown[i] = bh; get_bh(bh); /* free_thrown puts this */ @@ -1769,7 +1770,7 @@ static void free_thrown(struct tree_balance *tb) { int i; b_blocknr_t blocknr; - for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++) { + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { if (tb->thrown[i]) { blocknr = tb->thrown[i]->b_blocknr; if (buffer_dirty(tb->thrown[i])) diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 5109f1d5e7f..abfada2f52d 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1556,7 +1556,7 @@ const struct file_operations reiserfs_file_operations = { .splice_write = generic_file_splice_write, }; -struct inode_operations reiserfs_file_inode_operations = { +const struct inode_operations reiserfs_file_inode_operations = { .truncate = reiserfs_vfs_truncate_file, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 23f5cd5bbf5..a2161840bc7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1525,7 +1525,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* * directories can handle most operations... */ -struct inode_operations reiserfs_dir_inode_operations = { +const struct inode_operations reiserfs_dir_inode_operations = { //&reiserfs_dir_operations, /* default_file_ops */ .create = reiserfs_create, .lookup = reiserfs_lookup, @@ -1548,7 +1548,7 @@ struct inode_operations reiserfs_dir_inode_operations = { * symlink operations.. same as page_symlink_inode_operations, with xattr * stuff added */ -struct inode_operations reiserfs_symlink_inode_operations = { +const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -1564,7 +1564,7 @@ struct inode_operations reiserfs_symlink_inode_operations = { /* * special file operations.. just xattr/acl stuff */ -struct inode_operations reiserfs_special_inode_operations = { +const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, .getxattr = reiserfs_getxattr, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 58ad4551a7c..f13a7f164dc 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -593,7 +593,7 @@ static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t); #endif -static struct super_operations reiserfs_sops = { +static const struct super_operations reiserfs_sops = { .alloc_inode = reiserfs_alloc_inode, .destroy_inode = reiserfs_destroy_inode, .write_inode = reiserfs_write_inode, diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index d3e243a6f60..fd601014813 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -110,7 +110,7 @@ romfs_checksum(void *data, int size) return sum; } -static struct super_operations romfs_ops; +static const struct super_operations romfs_ops; static int romfs_fill_super(struct super_block *s, void *data, int silent) { @@ -468,7 +468,7 @@ static const struct file_operations romfs_dir_operations = { .readdir = romfs_readdir, }; -static struct inode_operations romfs_dir_inode_operations = { +static const struct inode_operations romfs_dir_inode_operations = { .lookup = romfs_lookup, }; @@ -598,7 +598,7 @@ static int romfs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations romfs_ops = { +static const struct super_operations romfs_ops = { .alloc_inode = romfs_alloc_inode, .destroy_inode = romfs_destroy_inode, .read_inode = romfs_read_inode, diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index b1e58d1ac9c..50136b1a3ec 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -42,7 +42,7 @@ const struct file_operations smb_dir_operations = .open = smb_dir_open, }; -struct inode_operations smb_dir_inode_operations = +const struct inode_operations smb_dir_inode_operations = { .create = smb_create, .lookup = smb_lookup, @@ -54,7 +54,7 @@ struct inode_operations smb_dir_inode_operations = .setattr = smb_notify_change, }; -struct inode_operations smb_dir_inode_operations_unix = +const struct inode_operations smb_dir_inode_operations_unix = { .create = smb_create, .lookup = smb_lookup, diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index e50533a7951..f161797160c 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -418,7 +418,7 @@ const struct file_operations smb_file_operations = .sendfile = smb_file_sendfile, }; -struct inode_operations smb_file_inode_operations = +const struct inode_operations smb_file_inode_operations = { .permission = smb_file_permission, .getattr = smb_getattr, diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 84dfe3f3482..5faba4f1c9a 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -98,7 +98,7 @@ static int smb_remount(struct super_block *sb, int *flags, char *data) return 0; } -static struct super_operations smb_sops = +static const struct super_operations smb_sops = { .alloc_inode = smb_alloc_inode, .destroy_inode = smb_destroy_inode, diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 34fb462b237..03f456c1b7d 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -36,8 +36,8 @@ extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, stru extern void smb_install_null_ops(struct smb_ops *ops); /* dir.c */ extern const struct file_operations smb_dir_operations; -extern struct inode_operations smb_dir_inode_operations; -extern struct inode_operations smb_dir_inode_operations_unix; +extern const struct inode_operations smb_dir_inode_operations; +extern const struct inode_operations smb_dir_inode_operations_unix; extern void smb_new_dentry(struct dentry *dentry); extern void smb_renew_times(struct dentry *dentry); /* cache.c */ @@ -65,7 +65,7 @@ extern int smb_notify_change(struct dentry *dentry, struct iattr *attr); /* file.c */ extern const struct address_space_operations smb_file_aops; extern const struct file_operations smb_file_operations; -extern struct inode_operations smb_file_inode_operations; +extern const struct inode_operations smb_file_inode_operations; /* ioctl.c */ extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); /* smbiod.c */ @@ -84,4 +84,4 @@ extern int smb_request_send_server(struct smb_sb_info *server); extern int smb_request_recv(struct smb_sb_info *server); /* symlink.c */ extern int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname); -extern struct inode_operations smb_link_inode_operations; +extern const struct inode_operations smb_link_inode_operations; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index a4bcae8a9af..42261dbdf60 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -61,7 +61,7 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, struct smb_request *req; unsigned char *buf = NULL; - req = kmem_cache_alloc(req_cachep, GFP_KERNEL); + req = kmem_cache_zalloc(req_cachep, GFP_KERNEL); VERBOSE("allocating request: %p\n", req); if (!req) goto out; @@ -74,7 +74,6 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, } } - memset(req, 0, sizeof(struct smb_request)); req->rq_buffer = buf; req->rq_bufsize = bufsize; req->rq_server = server; diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c index cdc53c4fb38..fea20ceb8a5 100644 --- a/fs/smbfs/symlink.c +++ b/fs/smbfs/symlink.c @@ -6,7 +6,6 @@ * Please add a note about your changes to smbfs in the ChangeLog file. */ -#include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fcntl.h> @@ -62,7 +61,7 @@ static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) __putname(s); } -struct inode_operations smb_link_inode_operations = +const struct inode_operations smb_link_inode_operations = { .readlink = generic_readlink, .follow_link = smb_follow_link, diff --git a/fs/stack.c b/fs/stack.c index 8ffb880d2f4..67716f6a1a4 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -20,11 +20,6 @@ EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, int (*get_nlinks)(struct inode *)) { - if (!get_nlinks) - dest->i_nlink = src->i_nlink; - else - dest->i_nlink = (*get_nlinks)(dest); - dest->i_mode = src->i_mode; dest->i_uid = src->i_uid; dest->i_gid = src->i_gid; @@ -34,5 +29,14 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; + + /* + * Update the nlinks AFTER updating the above fields, because the + * get_links callback may depend on them. + */ + if (!get_nlinks) + dest->i_nlink = src->i_nlink; + else + dest->i_nlink = (*get_nlinks)(dest); } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/super.c b/fs/super.c index 3e7458c2bb7..60b1e50cbf5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -285,7 +285,7 @@ int fsync_super(struct super_block *sb) */ void generic_shutdown_super(struct super_block *sb) { - struct super_operations *sop = sb->s_op; + const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index e8f540d38d4..d3b9f5f07db 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <asm/semaphore.h> #include "sysfs.h" @@ -146,7 +147,7 @@ static int open(struct inode * inode, struct file * file) Error: module_put(attr->attr.owner); Done: - if (error && kobj) + if (error) kobject_put(kobj); return error; } @@ -157,8 +158,7 @@ static int release(struct inode * inode, struct file * file) struct bin_attribute * attr = to_bin_attr(file->f_path.dentry); u8 * buffer = file->private_data; - if (kobj) - kobject_put(kobj); + kobject_put(kobj); module_put(attr->attr.owner); kfree(buffer); return 0; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 511edef8b32..8813990304f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <asm/semaphore.h> #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); @@ -32,25 +33,39 @@ static struct dentry_operations sysfs_dentry_ops = { /* * Allocates a new sysfs_dirent and links it to the parent sysfs_dirent */ -static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, - void * element) +static struct sysfs_dirent * __sysfs_new_dirent(void * element) { struct sysfs_dirent * sd; - sd = kmem_cache_alloc(sysfs_dir_cachep, GFP_KERNEL); + sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; - memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); - list_add(&sd->s_sibling, &parent_sd->s_children); + INIT_LIST_HEAD(&sd->s_sibling); sd->s_element = element; return sd; } +static void __sysfs_list_dirent(struct sysfs_dirent *parent_sd, + struct sysfs_dirent *sd) +{ + if (sd) + list_add(&sd->s_sibling, &parent_sd->s_children); +} + +static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent *parent_sd, + void * element) +{ + struct sysfs_dirent *sd; + sd = __sysfs_new_dirent(element); + __sysfs_list_dirent(parent_sd, sd); + return sd; +} + /* * * Return -EEXIST if there is already a sysfs element with the same name for @@ -77,14 +92,14 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd, } -int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, - void * element, umode_t mode, int type) +static struct sysfs_dirent * +__sysfs_make_dirent(struct dentry *dentry, void *element, mode_t mode, int type) { struct sysfs_dirent * sd; - sd = sysfs_new_dirent(parent_sd, element); + sd = __sysfs_new_dirent(element); if (!sd) - return -ENOMEM; + goto out; sd->s_mode = mode; sd->s_type = type; @@ -94,7 +109,19 @@ int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, dentry->d_op = &sysfs_dentry_ops; } - return 0; +out: + return sd; +} + +int sysfs_make_dirent(struct sysfs_dirent * parent_sd, struct dentry * dentry, + void * element, umode_t mode, int type) +{ + struct sysfs_dirent *sd; + + sd = __sysfs_make_dirent(dentry, element, mode, type); + __sysfs_list_dirent(parent_sd, sd); + + return sd ? 0 : -ENOMEM; } static int init_dir(struct inode * inode) @@ -165,11 +192,11 @@ int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d) /** * sysfs_create_dir - create a directory for an object. - * @parent: parent parent object. * @kobj: object we're creating directory for. + * @shadow_parent: parent parent object. */ -int sysfs_create_dir(struct kobject * kobj) +int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent) { struct dentry * dentry = NULL; struct dentry * parent; @@ -177,7 +204,9 @@ int sysfs_create_dir(struct kobject * kobj) BUG_ON(!kobj); - if (kobj->parent) + if (shadow_parent) + parent = shadow_parent; + else if (kobj->parent) parent = kobj->parent->dentry; else if (sysfs_mount && sysfs_mount->mnt_sb) parent = sysfs_mount->mnt_sb->s_root; @@ -267,7 +296,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); } -struct inode_operations sysfs_dir_inode_operations = { +const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, }; @@ -298,21 +327,12 @@ void sysfs_remove_subdir(struct dentry * d) } -/** - * sysfs_remove_dir - remove an object's directory. - * @kobj: object. - * - * The only thing special about this is that we remove any files in - * the directory before we remove the directory, and we've inlined - * what used to be sysfs_rmdir() below, instead of calling separately. - */ - -void sysfs_remove_dir(struct kobject * kobj) +static void __sysfs_remove_dir(struct dentry *dentry) { - struct dentry * dentry = dget(kobj->dentry); struct sysfs_dirent * parent_sd; struct sysfs_dirent * sd, * tmp; + dget(dentry); if (!dentry) return; @@ -333,32 +353,60 @@ void sysfs_remove_dir(struct kobject * kobj) * Drop reference from dget() on entrance. */ dput(dentry); +} + +/** + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. + */ + +void sysfs_remove_dir(struct kobject * kobj) +{ + __sysfs_remove_dir(kobj->dentry); kobj->dentry = NULL; } -int sysfs_rename_dir(struct kobject * kobj, const char *new_name) +int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent, + const char *new_name) { int error = 0; - struct dentry * new_dentry, * parent; - - if (!strcmp(kobject_name(kobj), new_name)) - return -EINVAL; + struct dentry * new_dentry; - if (!kobj->parent) - return -EINVAL; + if (!new_parent) + return -EFAULT; down_write(&sysfs_rename_sem); - parent = kobj->parent->dentry; - - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&new_parent->d_inode->i_mutex); - new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + /* By allowing two different directories with the + * same d_parent we allow this routine to move + * between different shadows of the same directory + */ + if (kobj->dentry->d_parent->d_inode != new_parent->d_inode) + return -EINVAL; + else if (new_dentry->d_parent->d_inode != new_parent->d_inode) + error = -EINVAL; + else if (new_dentry == kobj->dentry) + error = -EINVAL; + else if (!new_dentry->d_inode) { error = kobject_set_name(kobj, "%s", new_name); if (!error) { + struct sysfs_dirent *sd, *parent_sd; + d_add(new_dentry, NULL); d_move(kobj->dentry, new_dentry); + + sd = kobj->dentry->d_fsdata; + parent_sd = new_parent->d_fsdata; + + list_del_init(&sd->s_sibling); + list_add(&sd->s_sibling, &parent_sd->s_children); } else d_drop(new_dentry); @@ -366,7 +414,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&new_parent->d_inode->i_mutex); up_write(&sysfs_rename_sem); return error; @@ -378,12 +426,10 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent) struct sysfs_dirent *new_parent_sd, *sd; int error; - if (!new_parent) - return -EINVAL; - old_parent_dentry = kobj->parent ? kobj->parent->dentry : sysfs_mount->mnt_sb->s_root; - new_parent_dentry = new_parent->dentry; + new_parent_dentry = new_parent ? + new_parent->dentry : sysfs_mount->mnt_sb->s_root; again: mutex_lock(&old_parent_dentry->d_inode->i_mutex); @@ -547,6 +593,95 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) return offset; } + +/** + * sysfs_make_shadowed_dir - Setup so a directory can be shadowed + * @kobj: object we're creating shadow of. + */ + +int sysfs_make_shadowed_dir(struct kobject *kobj, + void * (*follow_link)(struct dentry *, struct nameidata *)) +{ + struct inode *inode; + struct inode_operations *i_op; + + inode = kobj->dentry->d_inode; + if (inode->i_op != &sysfs_dir_inode_operations) + return -EINVAL; + + i_op = kmalloc(sizeof(*i_op), GFP_KERNEL); + if (!i_op) + return -ENOMEM; + + memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op)); + i_op->follow_link = follow_link; + + /* Locking of inode->i_op? + * Since setting i_op is a single word write and they + * are atomic we should be ok here. + */ + inode->i_op = i_op; + return 0; +} + +/** + * sysfs_create_shadow_dir - create a shadow directory for an object. + * @kobj: object we're creating directory for. + * + * sysfs_make_shadowed_dir must already have been called on this + * directory. + */ + +struct dentry *sysfs_create_shadow_dir(struct kobject *kobj) +{ + struct sysfs_dirent *sd; + struct dentry *parent, *dir, *shadow; + struct inode *inode; + + dir = kobj->dentry; + inode = dir->d_inode; + parent = dir->d_parent; + shadow = ERR_PTR(-EINVAL); + if (!sysfs_is_shadowed_inode(inode)) + goto out; + + shadow = d_alloc(parent, &dir->d_name); + if (!shadow) + goto nomem; + + sd = __sysfs_make_dirent(shadow, kobj, inode->i_mode, SYSFS_DIR); + if (!sd) + goto nomem; + + d_instantiate(shadow, igrab(inode)); + inc_nlink(inode); + inc_nlink(parent->d_inode); + shadow->d_op = &sysfs_dentry_ops; + + dget(shadow); /* Extra count - pin the dentry in core */ + +out: + return shadow; +nomem: + dput(shadow); + shadow = ERR_PTR(-ENOMEM); + goto out; +} + +/** + * sysfs_remove_shadow_dir - remove an object's directory. + * @shadow: dentry of shadow directory + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. + */ + +void sysfs_remove_shadow_dir(struct dentry *shadow) +{ + __sysfs_remove_dir(shadow); +} + const struct file_operations sysfs_dir_operations = { .open = sysfs_dir_open, .release = sysfs_dir_close, diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9cfe53e1e00..98b0910ad80 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -7,6 +7,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -50,17 +51,29 @@ static struct sysfs_ops subsys_sysfs_ops = { .store = subsys_attr_store, }; +/** + * add_to_collection - add buffer to a collection + * @buffer: buffer to be added + * @node: inode of set to add to + */ -struct sysfs_buffer { - size_t count; - loff_t pos; - char * page; - struct sysfs_ops * ops; - struct semaphore sem; - int needs_read_fill; - int event; -}; +static inline void +add_to_collection(struct sysfs_buffer *buffer, struct inode *node) +{ + struct sysfs_buffer_collection *set = node->i_private; + mutex_lock(&node->i_mutex); + list_add(&buffer->associates, &set->associates); + mutex_unlock(&node->i_mutex); +} + +static inline void +remove_from_collection(struct sysfs_buffer *buffer, struct inode *node) +{ + mutex_lock(&node->i_mutex); + list_del(&buffer->associates); + mutex_unlock(&node->i_mutex); +} /** * fill_read_buffer - allocate and fill buffer from object. @@ -70,7 +83,8 @@ struct sysfs_buffer { * Allocate @buffer->page, if it hasn't been already, then call the * kobject's show() method to fill the buffer with this attribute's * data. - * This is called only once, on the file's first read. + * This is called only once, on the file's first read unless an error + * is returned. */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { @@ -88,12 +102,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); - buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); - if (count >= 0) + if (count >= 0) { + buffer->needs_read_fill = 0; buffer->count = count; - else + } else { ret = count; + } return ret; } @@ -153,6 +168,10 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) ssize_t retval = 0; down(&buffer->sem); + if (buffer->orphaned) { + retval = -ENODEV; + goto out; + } if (buffer->needs_read_fill) { if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) goto out; @@ -165,7 +184,6 @@ out: return retval; } - /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. @@ -243,19 +261,25 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t ssize_t len; down(&buffer->sem); + if (buffer->orphaned) { + len = -ENODEV; + goto out; + } len = fill_write_buffer(buffer, buf, count); if (len > 0) len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; +out: up(&buffer->sem); return len; } -static int check_perm(struct inode * inode, struct file * file) +static int sysfs_open_file(struct inode *inode, struct file *file) { struct kobject *kobj = sysfs_get_kobject(file->f_path.dentry->d_parent); struct attribute * attr = to_attr(file->f_path.dentry); + struct sysfs_buffer_collection *set; struct sysfs_buffer * buffer; struct sysfs_ops * ops = NULL; int error = 0; @@ -285,6 +309,18 @@ static int check_perm(struct inode * inode, struct file * file) if (!ops) goto Eaccess; + /* make sure we have a collection to add our buffers to */ + mutex_lock(&inode->i_mutex); + if (!(set = inode->i_private)) { + if (!(set = inode->i_private = kmalloc(sizeof(struct sysfs_buffer_collection), GFP_KERNEL))) { + error = -ENOMEM; + goto Done; + } else { + INIT_LIST_HEAD(&set->associates); + } + } + mutex_unlock(&inode->i_mutex); + /* File needs write support. * The inode's perms must say it's ok, * and we must have a store method. @@ -310,9 +346,11 @@ static int check_perm(struct inode * inode, struct file * file) */ buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL); if (buffer) { + INIT_LIST_HEAD(&buffer->associates); init_MUTEX(&buffer->sem); buffer->needs_read_fill = 1; buffer->ops = ops; + add_to_collection(buffer, inode); file->private_data = buffer; } else error = -ENOMEM; @@ -325,16 +363,11 @@ static int check_perm(struct inode * inode, struct file * file) error = -EACCES; module_put(attr->owner); Done: - if (error && kobj) + if (error) kobject_put(kobj); return error; } -static int sysfs_open_file(struct inode * inode, struct file * filp) -{ - return check_perm(inode,filp); -} - static int sysfs_release(struct inode * inode, struct file * filp) { struct kobject * kobj = to_kobj(filp->f_path.dentry->d_parent); @@ -342,8 +375,9 @@ static int sysfs_release(struct inode * inode, struct file * filp) struct module * owner = attr->owner; struct sysfs_buffer * buffer = filp->private_data; - if (kobj) - kobject_put(kobj); + if (buffer) + remove_from_collection(buffer, inode); + kobject_put(kobj); /* After this point, attr should not be accessed. */ module_put(owner); @@ -548,7 +582,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->dentry,attr->name); + sysfs_hash_and_remove(kobj->dentry, attr->name); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 122145b0895..b20951c9376 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -13,6 +13,8 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> +#include <linux/fs.h> +#include <asm/semaphore.h> #include "sysfs.h" diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e79e38d52c0..dd1344b007f 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -13,6 +13,7 @@ #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> +#include <asm/semaphore.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -28,10 +29,20 @@ static struct backing_dev_info sysfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -static struct inode_operations sysfs_inode_operations ={ +static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, }; +void sysfs_delete_inode(struct inode *inode) +{ + /* Free the shadowed directory inode operations */ + if (sysfs_is_shadowed_inode(inode)) { + kfree(inode->i_op); + inode->i_op = NULL; + } + return generic_delete_inode(inode); +} + int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; @@ -209,6 +220,22 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) return NULL; } +static inline void orphan_all_buffers(struct inode *node) +{ + struct sysfs_buffer_collection *set = node->i_private; + struct sysfs_buffer *buf; + + mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD); + if (node->i_private) { + list_for_each_entry(buf, &set->associates, associates) { + down(&buf->sem); + buf->orphaned = 1; + up(&buf->sem); + } + } + mutex_unlock(&node->i_mutex); +} + /* * Unhashes the dentry corresponding to given sysfs_dirent @@ -217,16 +244,23 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) { struct dentry * dentry = sd->s_dentry; + struct inode *inode; if (dentry) { spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { + inode = dentry->d_inode; + spin_lock(&inode->i_lock); + __iget(inode); + spin_unlock(&inode->i_lock); dget_locked(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); + orphan_all_buffers(inode); + iput(inode); } else { spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -248,7 +282,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name) return -ENOENT; parent_sd = dir->d_fsdata; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e503f858fba..23a48a38e6a 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -8,6 +8,7 @@ #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <asm/semaphore.h> #include "sysfs.h" @@ -18,9 +19,12 @@ struct vfsmount *sysfs_mount; struct super_block * sysfs_sb = NULL; struct kmem_cache *sysfs_dir_cachep; -static struct super_operations sysfs_ops = { +static void sysfs_clear_inode(struct inode *inode); + +static const struct super_operations sysfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = sysfs_delete_inode, + .clear_inode = sysfs_clear_inode, }; static struct sysfs_dirent sysfs_root = { @@ -31,6 +35,11 @@ static struct sysfs_dirent sysfs_root = { .s_iattr = NULL, }; +static void sysfs_clear_inode(struct inode *inode) +{ + kfree(inode->i_private); +} + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f50e3cc2ded..7b9c5bfde92 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <asm/semaphore.h> #include "sysfs.h" @@ -180,7 +181,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co free_page((unsigned long)page); } -struct inode_operations sysfs_symlink_inode_operations = { +const struct inode_operations sysfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index bd7cec295da..d976b000554 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -2,6 +2,7 @@ extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; +extern void sysfs_delete_inode(struct inode *inode); extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); @@ -25,14 +26,30 @@ extern struct super_block * sysfs_sb; extern const struct file_operations sysfs_dir_operations; extern const struct file_operations sysfs_file_operations; extern const struct file_operations bin_fops; -extern struct inode_operations sysfs_dir_inode_operations; -extern struct inode_operations sysfs_symlink_inode_operations; +extern const struct inode_operations sysfs_dir_inode_operations; +extern const struct inode_operations sysfs_symlink_inode_operations; struct sysfs_symlink { char * link_name; struct kobject * target_kobj; }; +struct sysfs_buffer { + struct list_head associates; + size_t count; + loff_t pos; + char * page; + struct sysfs_ops * ops; + struct semaphore sem; + int orphaned; + int needs_read_fill; + int event; +}; + +struct sysfs_buffer_collection { + struct list_head associates; +}; + static inline struct kobject * to_kobj(struct dentry * dentry) { struct sysfs_dirent * sd = dentry->d_fsdata; @@ -96,3 +113,7 @@ static inline void sysfs_put(struct sysfs_dirent * sd) release_sysfs_dirent(sd); } +static inline int sysfs_is_shadowed_inode(struct inode *inode) +{ + return S_ISDIR(inode->i_mode) && inode->i_op->follow_link; +} diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 47a4b728f15..0732ddb9020 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -30,7 +30,7 @@ const struct file_operations sysv_file_operations = { .sendfile = generic_file_sendfile, }; -struct inode_operations sysv_file_inode_operations = { +const struct inode_operations sysv_file_inode_operations = { .truncate = sysv_truncate, .getattr = sysv_getattr, }; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index ead9864567e..9311cac186f 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -142,7 +142,7 @@ static inline void write3byte(struct sysv_sb_info *sbi, } } -static struct inode_operations sysv_symlink_inode_operations = { +static const struct inode_operations sysv_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, @@ -327,7 +327,7 @@ static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags) inode_init_once(&si->vfs_inode); } -struct super_operations sysv_sops = { +const struct super_operations sysv_sops = { .alloc_inode = sysv_alloc_inode, .destroy_inode = sysv_destroy_inode, .read_inode = sysv_read_inode, diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index f7c08db8e34..4e48abbd2b5 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -292,7 +292,7 @@ out: /* * directories can handle most operations... */ -struct inode_operations sysv_dir_inode_operations = { +const struct inode_operations sysv_dir_inode_operations = { .create = sysv_create, .lookup = sysv_lookup, .link = sysv_link, diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c index b85ce61d635..00d2f8a43e4 100644 --- a/fs/sysv/symlink.c +++ b/fs/sysv/symlink.c @@ -14,7 +14,7 @@ static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations sysv_fast_symlink_inode_operations = { +const struct inode_operations sysv_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = sysv_follow_link, }; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index dcb18b2171f..5b4fedf17cc 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -159,13 +159,13 @@ extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); extern ino_t sysv_inode_by_name(struct dentry *); -extern struct inode_operations sysv_file_inode_operations; -extern struct inode_operations sysv_dir_inode_operations; -extern struct inode_operations sysv_fast_symlink_inode_operations; +extern const struct inode_operations sysv_file_inode_operations; +extern const struct inode_operations sysv_dir_inode_operations; +extern const struct inode_operations sysv_fast_symlink_inode_operations; extern const struct file_operations sysv_file_operations; extern const struct file_operations sysv_dir_operations; extern const struct address_space_operations sysv_aops; -extern struct super_operations sysv_sops; +extern const struct super_operations sysv_sops; extern struct dentry_operations sysv_dentry_operations; diff --git a/fs/udf/file.c b/fs/udf/file.c index d81f2db7b0e..40d5047defe 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -263,6 +263,6 @@ const struct file_operations udf_file_operations = { .sendfile = generic_file_sendfile, }; -struct inode_operations udf_file_inode_operations = { +const struct inode_operations udf_file_inode_operations = { .truncate = udf_truncate, }; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 73163325e5e..fe361cd19a9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1308,7 +1308,7 @@ end_rename: return retval; } -struct inode_operations udf_dir_inode_operations = { +const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, .link = udf_link, diff --git a/fs/udf/super.c b/fs/udf/super.c index 1dbc2955f02..8672b88f7ff 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -160,7 +160,7 @@ static void destroy_inodecache(void) } /* Superblock operations */ -static struct super_operations udf_sb_ops = { +static const struct super_operations udf_sb_ops = { .alloc_inode = udf_alloc_inode, .destroy_inode = udf_destroy_inode, .write_inode = udf_write_inode, diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 1033b7cf293..ee1dece1f6f 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -42,9 +42,9 @@ struct task_struct; struct buffer_head; struct super_block; -extern struct inode_operations udf_dir_inode_operations; +extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; -extern struct inode_operations udf_file_inode_operations; +extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 638f4c585e8..bcc44084e00 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -4,6 +4,8 @@ * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics + * + * UFS2 write support Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ #include <linux/fs.h> @@ -14,45 +16,48 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/capability.h> -#include <linux/sched.h> #include <linux/bitops.h> #include <asm/byteorder.h> #include "swab.h" #include "util.h" -static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *); -static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *); -static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *); -static unsigned ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, unsigned, unsigned); +#define INVBLOCK ((u64)-1L) + +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *); +static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); +static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); +static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); /* * Free 'count' fragments from fragment number 'fragment' */ -void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count) +void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; - unsigned cgno, bit, end_bit, bbase, blkmap, i, blkno, cylno; + unsigned cgno, bit, end_bit, bbase, blkmap, i; + u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - UFSD("ENTER, fragment %u, count %u\n", fragment, count); + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); lock_super(sb); - cgno = ufs_dtog(fragment); - bit = ufs_dtogd(fragment); + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); if (cgno >= uspi->s_ncg) { ufs_panic (sb, "ufs_free_fragments", "freeing blocks are outside device"); goto failed; @@ -101,9 +106,13 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count) fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - cylno = ufs_cbtocylno (bbase); - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno (bbase); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(bbase)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -127,24 +136,27 @@ failed: /* * Free 'count' fragments from fragment number 'fragment' (free whole blocks) */ -void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count) +void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; - unsigned overflow, cgno, bit, end_bit, blkno, i, cylno; + unsigned overflow, cgno, bit, end_bit, i; + u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - UFSD("ENTER, fragment %u, count %u\n", fragment, count); + UFSD("ENTER, fragment %llu, count %u\n", + (unsigned long long)fragment, count); if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { ufs_error (sb, "ufs_free_blocks", "internal error, " - "fragment %u, count %u\n", fragment, count); + "fragment %llu, count %u\n", + (unsigned long long)fragment, count); goto failed; } @@ -152,8 +164,8 @@ void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count) do_more: overflow = 0; - cgno = ufs_dtog (fragment); - bit = ufs_dtogd (fragment); + cgno = ufs_dtog(uspi, fragment); + bit = ufs_dtogd(uspi, fragment); if (cgno >= uspi->s_ncg) { ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); goto failed_unlock; @@ -187,9 +199,14 @@ do_more: fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - cylno = ufs_cbtocylno(i); - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(i); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(i)), 1); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -308,15 +325,19 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, } } -unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, - unsigned goal, unsigned count, int * err, struct page *locked_page) +u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, + u64 goal, unsigned count, int *err, + struct page *locked_page) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; - unsigned cgno, oldcount, newcount, tmp, request, result; + unsigned cgno, oldcount, newcount; + u64 tmp, request, result; - UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count); + UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)goal, count); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -324,11 +345,12 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, *err = -ENOSPC; lock_super (sb); - - tmp = fs32_to_cpu(sb, *p); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (count + ufs_fragnum(fragment) > uspi->s_fpb) { - ufs_warning (sb, "ufs_new_fragments", "internal warning" - " fragment %u, count %u", fragment, count); + ufs_warning(sb, "ufs_new_fragments", "internal warning" + " fragment %llu, count %u", + (unsigned long long)fragment, count); count = uspi->s_fpb - ufs_fragnum(fragment); } oldcount = ufs_fragnum (fragment); @@ -339,10 +361,12 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, */ if (oldcount) { if (!tmp) { - ufs_error (sb, "ufs_new_fragments", "internal error, " - "fragment %u, tmp %u\n", fragment, tmp); - unlock_super (sb); - return (unsigned)-1; + ufs_error(sb, "ufs_new_fragments", "internal error, " + "fragment %llu, tmp %llu\n", + (unsigned long long)fragment, + (unsigned long long)tmp); + unlock_super(sb); + return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); @@ -372,7 +396,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, if (goal == 0) cgno = ufs_inotocg (inode->i_ino); else - cgno = ufs_dtog (goal); + cgno = ufs_dtog(uspi, goal); /* * allocate new fragment @@ -380,14 +404,16 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { - *p = cpu_to_fs32(sb, result); + ufs_cpu_to_data_ptr(sb, p, result); *err = 0; - UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); - ufs_clear_frags(inode, result + oldcount, newcount - oldcount, - locked_page != NULL); + UFS_I(inode)->i_lastfrag = + max_t(u32, UFS_I(inode)->i_lastfrag, + fragment + count); + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); } unlock_super(sb); - UFSD("EXIT, result %u\n", result); + UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -401,7 +427,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); unlock_super(sb); - UFSD("EXIT, result %u\n", result); + UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -433,15 +459,14 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, locked_page != NULL); ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp, result, locked_page); - - *p = cpu_to_fs32(sb, result); + ufs_cpu_to_data_ptr(sb, p, result); *err = 0; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); unlock_super(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); - UFSD("EXIT, result %u\n", result); + UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -450,9 +475,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, return 0; } -static unsigned -ufs_add_fragments (struct inode * inode, unsigned fragment, - unsigned oldcount, unsigned newcount, int * err) +static u64 ufs_add_fragments(struct inode *inode, u64 fragment, + unsigned oldcount, unsigned newcount, int *err) { struct super_block * sb; struct ufs_sb_private_info * uspi; @@ -461,14 +485,15 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; - UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount); + UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", + (unsigned long long)fragment, oldcount, newcount); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first (uspi); count = newcount - oldcount; - cgno = ufs_dtog(fragment); + cgno = ufs_dtog(uspi, fragment); if (fs32_to_cpu(sb, UFS_SB(sb)->fs_cs(cgno).cs_nffree) < count) return 0; if ((ufs_fragnum (fragment) + newcount) > uspi->s_fpb) @@ -483,7 +508,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, return 0; } - fragno = ufs_dtogd (fragment); + fragno = ufs_dtogd(uspi, fragment); fragoff = ufs_fragnum (fragno); for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) @@ -521,7 +546,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, } sb->s_dirt = 1; - UFSD("EXIT, fragment %u\n", fragment); + UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); return fragment; } @@ -534,17 +559,19 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, if (fs32_to_cpu(sb, ucg->cg_frsum[k])) \ goto cg_found; -static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno, - unsigned goal, unsigned count, int * err) +static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, + u64 goal, unsigned count, int *err) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; - unsigned oldcg, i, j, k, result, allocsize; + unsigned oldcg, i, j, k, allocsize; + u64 result; - UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count); + UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + inode->i_ino, cgno, (unsigned long long)goal, count); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -593,7 +620,7 @@ cg_found: if (count == uspi->s_fpb) { result = ufs_alloccg_block (inode, ucpi, goal, err); - if (result == (unsigned)-1) + if (result == INVBLOCK) return 0; goto succed; } @@ -604,9 +631,9 @@ cg_found: if (allocsize == uspi->s_fpb) { result = ufs_alloccg_block (inode, ucpi, goal, err); - if (result == (unsigned)-1) + if (result == INVBLOCK) return 0; - goal = ufs_dtogd (result); + goal = ufs_dtogd(uspi, result); for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; @@ -620,7 +647,7 @@ cg_found: } result = ufs_bitmap_search (sb, ucpi, goal, allocsize); - if (result == (unsigned)-1) + if (result == INVBLOCK) return 0; if(DQUOT_ALLOC_BLOCK(inode, count)) { *err = -EDQUOT; @@ -647,20 +674,21 @@ succed: sb->s_dirt = 1; result += cgno * uspi->s_fpg; - UFSD("EXIT3, result %u\n", result); + UFSD("EXIT3, result %llu\n", (unsigned long long)result); return result; } -static unsigned ufs_alloccg_block (struct inode * inode, - struct ufs_cg_private_info * ucpi, unsigned goal, int * err) +static u64 ufs_alloccg_block(struct inode *inode, + struct ufs_cg_private_info *ucpi, + u64 goal, int *err) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; - unsigned result, cylno, blkno; + u64 result, blkno; - UFSD("ENTER, goal %u\n", goal); + UFSD("ENTER, goal %llu\n", (unsigned long long)goal); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -672,7 +700,7 @@ static unsigned ufs_alloccg_block (struct inode * inode, goto norot; } goal = ufs_blknum (goal); - goal = ufs_dtogd (goal); + goal = ufs_dtogd(uspi, goal); /* * If the requested block is available, use it. @@ -684,8 +712,8 @@ static unsigned ufs_alloccg_block (struct inode * inode, norot: result = ufs_bitmap_search (sb, ucpi, goal, uspi->s_fpb); - if (result == (unsigned)-1) - return (unsigned)-1; + if (result == INVBLOCK) + return INVBLOCK; ucpi->c_rotor = result; gotit: blkno = ufs_fragstoblks(result); @@ -694,17 +722,22 @@ gotit: ufs_clusteracct (sb, ucpi, blkno, -1); if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) { *err = -EDQUOT; - return (unsigned)-1; + return INVBLOCK; } fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree--; fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); - cylno = ufs_cbtocylno(result); - fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1); - fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno((unsigned)result); + + fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos((unsigned)result)), 1); + fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); + } - UFSD("EXIT, result %u\n", result); + UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -744,9 +777,9 @@ static unsigned ubh_scanc(struct ufs_sb_private_info *uspi, * @goal: near which block we want find new one * @count: specified size */ -static unsigned ufs_bitmap_search(struct super_block *sb, - struct ufs_cg_private_info *ucpi, - unsigned goal, unsigned count) +static u64 ufs_bitmap_search(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + u64 goal, unsigned count) { /* * Bit patterns for identifying fragments in the block map @@ -761,16 +794,18 @@ static unsigned ufs_bitmap_search(struct super_block *sb, struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_super_block_first *usb1; struct ufs_cylinder_group *ucg; - unsigned start, length, loc, result; + unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; + u64 result; - UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count); + UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, + (unsigned long long)goal, count); usb1 = ubh_get_usb_first (uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) - start = ufs_dtogd(goal) >> 3; + start = ufs_dtogd(uspi, goal) >> 3; else start = ucpi->c_frotor >> 3; @@ -790,7 +825,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb, " length %u, count %u, freeoff %u\n", ucpi->c_cgx, start, length, count, ucpi->c_freeoff); - return (unsigned)-1; + return INVBLOCK; } start = 0; } @@ -808,7 +843,8 @@ static unsigned ufs_bitmap_search(struct super_block *sb, want = want_arr[count]; for (pos = 0; pos <= uspi->s_fpb - count; pos++) { if ((blockmap & mask) == want) { - UFSD("EXIT, result %u\n", result); + UFSD("EXIT, result %llu\n", + (unsigned long long)result); return result + pos; } mask <<= 1; @@ -819,7 +855,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb, ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx); UFSD("EXIT (FAILED)\n"); - return (unsigned)-1; + return INVBLOCK; } static void ufs_clusteracct(struct super_block * sb, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 433b6f68403..4890ddf1518 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -20,7 +20,6 @@ #include <linux/fs.h> #include <linux/ufs_fs.h> #include <linux/smp_lock.h> -#include <linux/sched.h> #include "swab.h" #include "util.h" @@ -106,12 +105,13 @@ static void ufs_check_page(struct page *page) char *kaddr = page_address(page); unsigned offs, rec_len; unsigned limit = PAGE_CACHE_SIZE; + const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { limit = dir->i_size & ~PAGE_CACHE_MASK; - if (limit & (UFS_SECTOR_SIZE - 1)) + if (limit & chunk_mask) goto Ebadsize; if (!limit) goto out; @@ -126,7 +126,7 @@ static void ufs_check_page(struct page *page) goto Ealign; if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1)) + if (((offs + rec_len - 1) ^ offs) & ~chunk_mask) goto Espan; if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * UFS_SB(sb)->s_uspi->s_ncg)) @@ -310,6 +310,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; unsigned reclen = UFS_DIR_REC_LEN(namelen); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; @@ -342,8 +343,8 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) if ((char *)de == dir_end) { /* We hit i_size */ name_len = 0; - rec_len = UFS_SECTOR_SIZE; - de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE); + rec_len = chunk_size; + de->d_reclen = cpu_to_fs16(sb, chunk_size); de->d_ino = 0; goto got_it; } @@ -431,7 +432,7 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = ufs_dir_pages(inode); - unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1); + unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = filp->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -511,7 +512,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, struct super_block *sb = inode->i_sb; struct address_space *mapping = page->mapping; char *kaddr = page_address(page); - unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); struct ufs_dir_entry *pde = NULL; struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); @@ -556,6 +557,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) struct super_block * sb = dir->i_sb; struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); + const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; struct ufs_dir_entry * de; char *base; int err; @@ -563,7 +565,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) if (!page) return -ENOMEM; kmap(page); - err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE); + err = mapping->a_ops->prepare_write(NULL, page, 0, chunk_size); if (err) { unlock_page(page); goto fail; @@ -584,11 +586,11 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) ((char *)de + fs16_to_cpu(sb, de->d_reclen)); de->d_ino = cpu_to_fs32(sb, dir->i_ino); ufs_set_de_type(sb, de, dir->i_mode); - de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1)); + de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE); + err = ufs_commit_chunk(page, 0, chunk_size); fail: kunmap(page); page_cache_release(page); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 2ad1259c6ec..b868878009b 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -18,6 +18,9 @@ * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * UFS2 write support added by + * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ #include <linux/fs.h> @@ -126,6 +129,47 @@ void ufs_free_inode (struct inode * inode) } /* + * Nullify new chunk of inodes, + * BSD people also set ui_gen field of inode + * during nullification, but we not care about + * that because of linux ufs do not support NFS + */ +static void ufs2_init_inodes_chunk(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + struct ufs_cylinder_group *ucg) +{ + struct buffer_head *bh; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + sector_t beg = uspi->s_sbbase + + ufs_inotofsba(ucpi->c_cgx * uspi->s_ipg + + fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk)); + sector_t end = beg + uspi->s_fpb; + + UFSD("ENTER cgno %d\n", ucpi->c_cgx); + + for (; beg < end; ++beg) { + bh = sb_getblk(sb, beg); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + + fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); + ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); + if (sb->s_flags & MS_SYNCHRONOUS) { + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer(UCPI_UBH(ucpi)); + } + + UFSD("EXIT\n"); +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -146,6 +190,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode) struct inode * inode; unsigned cg, bit, i, j, start; struct ufs_inode_info *ufsi; + int err = -ENOSPC; UFSD("ENTER\n"); @@ -198,13 +243,15 @@ struct inode * ufs_new_inode(struct inode * dir, int mode) goto cg_found; } } - + goto failed; cg_found: ucpi = ufs_load_cylinder (sb, cg); - if (!ucpi) + if (!ucpi) { + err = -EIO; goto failed; + } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); @@ -216,6 +263,7 @@ cg_found: if (!(bit < start)) { ufs_error (sb, "ufs_new_inode", "cylinder group %u corrupted - error in inode bitmap\n", cg); + err = -EIO; goto failed; } } @@ -224,9 +272,18 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); else { ufs_panic (sb, "ufs_new_inode", "internal error"); + err = -EIO; goto failed; } - + + if (uspi->fs_magic == UFS2_MAGIC) { + u32 initediblk = fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_initediblk); + + if (bit + uspi->s_inopb > initediblk && + initediblk < fs32_to_cpu(sb, ucg->cg_u.cg_u2.cg_niblk)) + ufs2_init_inodes_chunk(sb, ucpi, ucg); + } + fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); uspi->cs_total.cs_nifree--; fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); @@ -236,7 +293,6 @@ cg_found: uspi->cs_total.cs_ndir++; fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); } - ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { @@ -245,6 +301,7 @@ cg_found: } sb->s_dirt = 1; + inode->i_ino = cg * uspi->s_ipg + bit; inode->i_mode = mode; inode->i_uid = current->fsuid; if (dir->i_mode & S_ISGID) { @@ -254,39 +311,72 @@ cg_found: } else inode->i_gid = current->fsgid; - inode->i_ino = cg * uspi->s_ipg + bit; inode->i_blocks = 0; + inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; - ufsi->i_gen = 0; ufsi->i_shadow = 0; ufsi->i_osync = 0; ufsi->i_oeftflag = 0; ufsi->i_dir_start_lookup = 0; memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); - insert_inode_hash(inode); mark_inode_dirty(inode); + if (uspi->fs_magic == UFS2_MAGIC) { + struct buffer_head *bh; + struct ufs2_inode *ufs2_inode; + + /* + * setup birth date, we do it here because of there is no sense + * to hold it in struct ufs_inode_info, and lose 64 bit + */ + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", + "unable to read inode %lu\n", + inode->i_ino); + err = -EIO; + goto fail_remove_inode; + } + lock_buffer(bh); + ufs2_inode = (struct ufs2_inode *)bh->b_data; + ufs2_inode += ufs_inotofsbo(inode->i_ino); + ufs2_inode->ui_birthtime.tv_sec = + cpu_to_fs32(sb, CURRENT_TIME_SEC.tv_sec); + ufs2_inode->ui_birthtime.tv_usec = 0; + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (sb->s_flags & MS_SYNCHRONOUS) + sync_dirty_buffer(bh); + brelse(bh); + } + unlock_super (sb); if (DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); - inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - iput(inode); - return ERR_PTR(-EDQUOT); + err = -EDQUOT; + goto fail_without_unlock; } UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; +fail_remove_inode: + unlock_super(sb); +fail_without_unlock: + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); failed: unlock_super (sb); make_bad_inode(inode); iput (inode); - UFSD("EXIT (FAILED)\n"); - return ERR_PTR(-ENOSPC); + UFSD("EXIT (FAILED): err %d\n", err); + return ERR_PTR(err); } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 4295ca91cf8..fb34ad03e22 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -170,7 +170,7 @@ out: * @locked_page - for ufs_new_fragments() */ static struct buffer_head * -ufs_inode_getfrag(struct inode *inode, unsigned int fragment, +ufs_inode_getfrag(struct inode *inode, u64 fragment, sector_t new_fragment, unsigned int required, int *err, long *phys, int *new, struct page *locked_page) { @@ -178,12 +178,12 @@ ufs_inode_getfrag(struct inode *inode, unsigned int fragment, struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * result; - unsigned block, blockoff, lastfrag, lastblock, lastblockoff; - unsigned tmp, goal; - __fs32 * p, * p2; + unsigned blockoff, lastblockoff; + u64 tmp, goal, lastfrag, block, lastblock; + void *p, *p2; - UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, " - "metadata %d\n", inode->i_ino, fragment, + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " + "metadata %d\n", inode->i_ino, (unsigned long long)fragment, (unsigned long long)new_fragment, required, !phys); /* TODO : to be done for write support @@ -193,17 +193,20 @@ ufs_inode_getfrag(struct inode *inode, unsigned int fragment, block = ufs_fragstoblks (fragment); blockoff = ufs_fragnum (fragment); - p = ufsi->i_u1.i_data + block; + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + goal = 0; repeat: - tmp = fs32_to_cpu(sb, *p); + tmp = ufs_data_ptr_to_cpu(sb, p); + lastfrag = ufsi->i_lastfrag; if (tmp && fragment < lastfrag) { if (!phys) { result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == fs32_to_cpu(sb, *p)) { - UFSD("EXIT, result %u\n", tmp + blockoff); + if (tmp == ufs_data_ptr_to_cpu(sb, p)) { + UFSD("EXIT, result %llu\n", + (unsigned long long)tmp + blockoff); return result; } brelse (result); @@ -224,10 +227,11 @@ repeat: * We must reallocate last allocated block */ if (lastblockoff) { - p2 = ufsi->i_u1.i_data + lastblock; - tmp = ufs_new_fragments (inode, p2, lastfrag, - fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, - err, locked_page); + p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); + tmp = ufs_new_fragments(inode, p2, lastfrag, + ufs_data_ptr_to_cpu(sb, p2), + uspi->s_fpb - lastblockoff, + err, locked_page); if (!tmp) { if (lastfrag != ufsi->i_lastfrag) goto repeat; @@ -237,27 +241,31 @@ repeat: lastfrag = ufsi->i_lastfrag; } - tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]); + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, + lastblock)); if (tmp) goal = tmp + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, err, phys != NULL ? locked_page : NULL); - } + } else if (lastblock == block) { /* * We will extend last allocated block */ - else if (lastblock == block) { - tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), - fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), + tmp = ufs_new_fragments(inode, p, fragment - + (blockoff - lastblockoff), + ufs_data_ptr_to_cpu(sb, p), + required + (blockoff - lastblockoff), err, phys != NULL ? locked_page : NULL); } else /* (lastblock > block) */ { /* * We will allocate new block before last allocated block */ if (block) { - tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]); + tmp = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); if (tmp) goal = tmp + uspi->s_fpb; } @@ -266,7 +274,7 @@ repeat: phys != NULL ? locked_page : NULL); } if (!tmp) { - if ((!blockoff && *p) || + if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || (blockoff && lastfrag != ufsi->i_lastfrag)) goto repeat; *err = -ENOSPC; @@ -286,7 +294,7 @@ repeat: if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD("EXIT, result %u\n", tmp + blockoff); + UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); return result; /* This part : To be implemented .... @@ -320,20 +328,22 @@ repeat2: */ static struct buffer_head * ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, - unsigned int fragment, sector_t new_fragment, int *err, + u64 fragment, sector_t new_fragment, int *err, long *phys, int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * result; - unsigned tmp, goal, block, blockoff; - __fs32 * p; + unsigned blockoff; + u64 tmp, goal, block; + void *p; block = ufs_fragstoblks (fragment); blockoff = ufs_fragnum (fragment); - UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n", - inode->i_ino, fragment, (unsigned long long)new_fragment, !phys); + UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", + inode->i_ino, (unsigned long long)fragment, + (unsigned long long)new_fragment, !phys); result = NULL; if (!bh) @@ -344,14 +354,16 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, if (!buffer_uptodate(bh)) goto out; } - - p = (__fs32 *) bh->b_data + block; + if (uspi->fs_magic == UFS2_MAGIC) + p = (__fs64 *)bh->b_data + block; + else + p = (__fs32 *)bh->b_data + block; repeat: - tmp = fs32_to_cpu(sb, *p); + tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) { if (!phys) { result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == fs32_to_cpu(sb, *p)) + if (tmp == ufs_data_ptr_to_cpu(sb, p)) goto out; brelse (result); goto repeat; @@ -361,14 +373,16 @@ repeat: } } - if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]))) + if (block && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, locked_page); if (!tmp) { - if (fs32_to_cpu(sb, *p)) + if (ufs_data_ptr_to_cpu(sb, p)) goto repeat; goto out; } @@ -386,7 +400,7 @@ repeat: sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD("result %u\n", tmp + blockoff); + UFSD("result %llu\n", (unsigned long long)tmp + blockoff); out: brelse (bh); UFSD("EXIT\n"); @@ -616,8 +630,8 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); - ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); @@ -661,8 +675,8 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); + inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); - ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); /* ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); @@ -731,34 +745,11 @@ bad_inode: make_bad_inode(inode); } -static int ufs_update_inode(struct inode * inode, int do_sync) +static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) { - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct buffer_head * bh; - struct ufs_inode * ufs_inode; - unsigned i; - unsigned flags; - - UFSD("ENTER, ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - flags = UFS_SB(sb)->s_flags; - - if (inode->i_ino < UFS_ROOTINO || - inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); - return -1; - } - - bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); - if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); - return -1; - } - ufs_inode = (struct ufs_inode *) (bh->b_data + ufs_inotofsbo(inode->i_ino) * sizeof(struct ufs_inode)); + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + unsigned i; ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); @@ -775,9 +766,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync) ufs_inode->ui_mtime.tv_usec = 0; ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks); ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); - ufs_inode->ui_gen = cpu_to_fs32(sb, ufsi->i_gen); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); - if ((flags & UFS_UID_MASK) == UFS_UID_EFT) { + if ((UFS_SB(sb)->s_flags & UFS_UID_MASK) == UFS_UID_EFT) { ufs_inode->ui_u3.ui_sun.ui_shadow = cpu_to_fs32(sb, ufsi->i_shadow); ufs_inode->ui_u3.ui_sun.ui_oeftflag = cpu_to_fs32(sb, ufsi->i_oeftflag); } @@ -796,6 +787,78 @@ static int ufs_update_inode(struct inode * inode, int do_sync) if (!inode->i_nlink) memset (ufs_inode, 0, sizeof(struct ufs_inode)); +} + +static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_inode_info *ufsi = UFS_I(inode); + unsigned i; + + UFSD("ENTER\n"); + ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); + ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); + + ufs_inode->ui_uid = cpu_to_fs32(sb, inode->i_uid); + ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid); + + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_usec = 0; + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_usec = 0; + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_usec = 0; + + ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); + ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); + ufs_inode->ui_gen = cpu_to_fs32(sb, inode->i_generation); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ + ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0]; + } else if (inode->i_blocks) { + for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) + ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.u2_i_data[i]; + } else { + for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) + ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i]; + } + + if (!inode->i_nlink) + memset (ufs_inode, 0, sizeof(struct ufs2_inode)); + UFSD("EXIT\n"); +} + +static int ufs_update_inode(struct inode * inode, int do_sync) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + return -1; + } + + bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + return -1; + } + if (uspi->fs_magic == UFS2_MAGIC) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_update_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *) bh->b_data; + + ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } mark_buffer_dirty(bh); if (do_sync) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index e84c0ecf073..a059ccd064e 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -355,7 +355,7 @@ out: return err; } -struct inode_operations ufs_dir_inode_operations = { +const struct inode_operations ufs_dir_inode_operations = { .create = ufs_create, .lookup = ufs_lookup, .link = ufs_link, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8a8e9382ec0..b5a6461ec66 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -61,6 +61,8 @@ * UFS2 (of FreeBSD 5.x) support added by * Niraj Kumar <niraj17@iitbombay.org>, Jan 2004 * + * UFS2 write support added by + * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ @@ -93,14 +95,16 @@ /* * Print contents of ufs_super_block, useful for debugging */ -static void ufs_print_super_stuff(struct super_block *sb, unsigned flags, +static void ufs_print_super_stuff(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_second *usb2, struct ufs_super_block_third *usb3) { + u32 magic = fs32_to_cpu(sb, usb3->fs_magic); + printk("ufs_print_super_stuff\n"); - printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic)); - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + printk(" magic: 0x%x\n", magic); + if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { printk(" fs_size: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); printk(" fs_dsize: %llu\n", (unsigned long long) @@ -117,6 +121,12 @@ static void ufs_print_super_stuff(struct super_block *sb, unsigned flags, printk(" cs_nbfree(No of free blocks): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); } else { printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); @@ -199,11 +209,11 @@ static void ufs_print_cylinder_stuff(struct super_block *sb, printk("\n"); } #else -# define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/ +# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ # define ufs_print_cylinder_stuff(sb, cg) /**/ #endif /* CONFIG_UFS_DEBUG */ -static struct super_operations ufs_super_ops; +static const struct super_operations ufs_super_ops; static char error_buf[1024]; @@ -422,7 +432,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; - unsigned flags = sbi->s_flags; struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; @@ -446,11 +455,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb) if (i + uspi->s_fpb > blks) size = (blks - i) * uspi->s_fsize; - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - ubh = ubh_bread(sb, - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size); - else - ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); if (!ubh) goto failed; @@ -545,6 +550,7 @@ static void ufs_put_cstotal(struct super_block *sb) cpu_to_fs32(sb, uspi->cs_total.cs_nffree); } ubh_mark_buffer_dirty(USPI_UBH(uspi)); + ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); } @@ -572,7 +578,9 @@ static void ufs_put_super_internal(struct super_block *sb) size = uspi->s_bsize; if (i + uspi->s_fpb > blks) size = (blks - i) * uspi->s_fsize; + ubh = ubh_bread(sb, uspi->s_csaddr + i, size); + ubh_memcpyubh (ubh, space, size); space += size; ubh_mark_buffer_uptodate (ubh, 1); @@ -649,7 +657,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) kmalloc (sizeof(struct ufs_sb_private_info), GFP_KERNEL); if (!uspi) goto failed; - + uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; /* Keep 2Gig file limit. Some UFS variants need to override @@ -674,10 +682,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; - if (!(sb->s_flags & MS_RDONLY)) { - printk(KERN_INFO "ufstype=ufs2 is supported read-only\n"); - sb->s_flags |= MS_RDONLY; - } break; case UFS_MOUNT_UFSTYPE_SUN: @@ -718,6 +722,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP: + /*TODO: check may be we need set special dir block size?*/ UFSD("ufstype=nextstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); @@ -733,6 +738,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: + /*TODO: check may be we need set special dir block size?*/ UFSD("ufstype=nextstep-cd\n"); uspi->s_fsize = block_size = 2048; uspi->s_fmask = ~(2048 - 1); @@ -754,6 +760,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; + uspi->s_dirblksize = 1024; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) @@ -887,7 +894,7 @@ magic_found: } - ufs_print_super_stuff(sb, flags, usb1, usb2, usb3); + ufs_print_super_stuff(sb, usb1, usb2, usb3); /* * Check, if file system was correctly unmounted. @@ -970,7 +977,12 @@ magic_found: uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); - uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); + + if (uspi->fs_magic == UFS2_MAGIC) + uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); + else + uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); + uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); uspi->s_cgsize = fs32_to_cpu(sb, usb1->fs_cgsize); uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); @@ -1057,7 +1069,6 @@ static void ufs_write_super(struct super_block *sb) unsigned flags; lock_kernel(); - UFSD("ENTER\n"); flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; @@ -1153,7 +1164,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && ufstype != UFS_MOUNT_UFSTYPE_44BSD && - ufstype != UFS_MOUNT_UFSTYPE_SUNx86) { + ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && + ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); return -EINVAL; } @@ -1252,7 +1264,7 @@ static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); #endif -static struct super_operations ufs_super_ops = { +static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, .destroy_inode = ufs_destroy_inode, .read_inode = ufs_read_inode, diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index 337512ed578..d8549f807e8 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -36,7 +36,7 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -struct inode_operations ufs_fast_symlink_inode_operations = { +const struct inode_operations ufs_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ufs_follow_link, }; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 0437b0a6fe9..749581fa772 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -30,8 +30,8 @@ */ /* - * Modified to avoid infinite loop on 2006 by - * Evgeniy Dushistov <dushistov@mail.ru> + * Adoptation to use page cache and UFS2 write support by + * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007 */ #include <linux/errno.h> @@ -63,13 +63,13 @@ #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) -static int ufs_trunc_direct (struct inode * inode) +static int ufs_trunc_direct(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - __fs32 * p; - unsigned frag1, frag2, frag3, frag4, block1, block2; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; unsigned frag_to_free, free_count; unsigned i, tmp; int retry; @@ -91,13 +91,16 @@ static int ufs_trunc_direct (struct inode * inode) if (frag2 > frag3) { frag2 = frag4; frag3 = frag4 = 0; - } - else if (frag2 < frag3) { + } else if (frag2 < frag3) { block1 = ufs_fragstoblks (frag2); block2 = ufs_fragstoblks (frag3); } - UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4); + UFSD("frag1 %llu, frag2 %llu, block1 %llu, block2 %llu, frag3 %llu," + " frag4 %llu\n", + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); if (frag1 >= frag2) goto next1; @@ -105,8 +108,8 @@ static int ufs_trunc_direct (struct inode * inode) /* * Free first free fragments */ - p = ufsi->i_u1.i_data + ufs_fragstoblks (frag1); - tmp = fs32_to_cpu(sb, *p); + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp ) ufs_panic (sb, "ufs_trunc_direct", "internal error"); frag2 -= frag1; @@ -121,12 +124,11 @@ next1: * Free whole blocks */ for (i = block1 ; i < block2; i++) { - p = ufsi->i_u1.i_data + i; - tmp = fs32_to_cpu(sb, *p); + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) continue; - - *p = 0; + ufs_data_ptr_clear(uspi, p); if (free_count == 0) { frag_to_free = tmp; @@ -150,13 +152,12 @@ next1: /* * Free last free fragments */ - p = ufsi->i_u1.i_data + ufs_fragstoblks (frag3); - tmp = fs32_to_cpu(sb, *p); + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp ) ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); - - *p = 0; + ufs_data_ptr_clear(uspi, p); ufs_free_fragments (inode, tmp, frag4); mark_inode_dirty(inode); @@ -167,17 +168,20 @@ next1: } -static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) +static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ind_ubh; - __fs32 * ind; - unsigned indirect_block, i, tmp; - unsigned frag_to_free, free_count; + void *ind; + u64 tmp, indirect_block, i, frag_to_free; + unsigned free_count; int retry; - UFSD("ENTER\n"); + UFSD("ENTER: ino %lu, offset %llu, p: %p\n", + inode->i_ino, (unsigned long long)offset, p); + + BUG_ON(!p); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -186,27 +190,27 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) free_count = 0; retry = 0; - tmp = fs32_to_cpu(sb, *p); + tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) return 0; ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != fs32_to_cpu(sb, *p)) { + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { ubh_brelse (ind_ubh); return 1; } if (!ind_ubh) { - *p = 0; + ufs_data_ptr_clear(uspi, p); return 0; } indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; for (i = indirect_block; i < uspi->s_apb; i++) { - ind = ubh_get_addr32 (ind_ubh, i); - tmp = fs32_to_cpu(sb, *ind); + ind = ubh_get_data_ptr(uspi, ind_ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); if (!tmp) continue; - *ind = 0; + ufs_data_ptr_clear(uspi, ind); ubh_mark_buffer_dirty(ind_ubh); if (free_count == 0) { frag_to_free = tmp; @@ -226,11 +230,12 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) ufs_free_blocks (inode, frag_to_free, free_count); } for (i = 0; i < uspi->s_apb; i++) - if (*ubh_get_addr32(ind_ubh,i)) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, ind_ubh, i))) break; if (i >= uspi->s_apb) { - tmp = fs32_to_cpu(sb, *p); - *p = 0; + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); ufs_free_blocks (inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); @@ -248,13 +253,13 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) return retry; } -static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) +static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_buffer_head * dind_bh; - unsigned i, tmp, dindirect_block; - __fs32 * dind; + struct ufs_buffer_head *dind_bh; + u64 i, tmp, dindirect_block; + void *dind; int retry = 0; UFSD("ENTER\n"); @@ -266,22 +271,22 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; retry = 0; - tmp = fs32_to_cpu(sb, *p); + tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) return 0; dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != fs32_to_cpu(sb, *p)) { + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { ubh_brelse (dind_bh); return 1; } if (!dind_bh) { - *p = 0; + ufs_data_ptr_clear(uspi, p); return 0; } for (i = dindirect_block ; i < uspi->s_apb ; i++) { - dind = ubh_get_addr32 (dind_bh, i); - tmp = fs32_to_cpu(sb, *dind); + dind = ubh_get_data_ptr(uspi, dind_bh, i); + tmp = ufs_data_ptr_to_cpu(sb, dind); if (!tmp) continue; retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); @@ -289,11 +294,12 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) } for (i = 0; i < uspi->s_apb; i++) - if (*ubh_get_addr32 (dind_bh, i)) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, dind_bh, i))) break; if (i >= uspi->s_apb) { - tmp = fs32_to_cpu(sb, *p); - *p = 0; + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); @@ -311,34 +317,33 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) return retry; } -static int ufs_trunc_tindirect (struct inode * inode) +static int ufs_trunc_tindirect(struct inode *inode) { + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; struct ufs_buffer_head * tind_bh; - unsigned tindirect_block, tmp, i; - __fs32 * tind, * p; + u64 tindirect_block, tmp, i; + void *tind, *p; int retry; UFSD("ENTER\n"); - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; retry = 0; tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; - p = ufsi->i_u1.i_data + UFS_TIND_BLOCK; - if (!(tmp = fs32_to_cpu(sb, *p))) + + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); + if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) return 0; tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (tmp != fs32_to_cpu(sb, *p)) { + if (tmp != ufs_data_ptr_to_cpu(sb, p)) { ubh_brelse (tind_bh); return 1; } if (!tind_bh) { - *p = 0; + ufs_data_ptr_clear(uspi, p); return 0; } @@ -349,11 +354,12 @@ static int ufs_trunc_tindirect (struct inode * inode) ubh_mark_buffer_dirty(tind_bh); } for (i = 0; i < uspi->s_apb; i++) - if (*ubh_get_addr32 (tind_bh, i)) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, tind_bh, i))) break; if (i >= uspi->s_apb) { - tmp = fs32_to_cpu(sb, *p); - *p = 0; + tmp = ufs_data_ptr_to_cpu(sb, p); + ufs_data_ptr_clear(uspi, p); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); @@ -375,7 +381,8 @@ static int ufs_alloc_lastblock(struct inode *inode) int err = 0; struct address_space *mapping = inode->i_mapping; struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; - unsigned lastfrag, i, end; + unsigned i, end; + sector_t lastfrag; struct page *lastpage; struct buffer_head *bh; @@ -430,7 +437,9 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; int retry, err = 0; - UFSD("ENTER\n"); + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)i_size_read(inode), + (unsigned long long)old_i_size); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -450,10 +459,12 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) lock_kernel(); while (1) { retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect (inode, UFS_IND_BLOCK, - (__fs32 *) &ufsi->i_u1.i_data[UFS_IND_BLOCK]); - retry |= ufs_trunc_dindirect (inode, UFS_IND_BLOCK + uspi->s_apb, - (__fs32 *) &ufsi->i_u1.i_data[UFS_DIND_BLOCK]); + retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_IND_BLOCK)); + retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_DIND_BLOCK)); retry |= ufs_trunc_tindirect (inode); if (!retry) break; @@ -502,6 +513,6 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) return inode_setattr(inode, attr); } -struct inode_operations ufs_file_inode_operations = { +const struct inode_operations ufs_file_inode_operations = { .setattr = ufs_setattr, }; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 7dd12bb1d62..06d344839c4 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -305,8 +305,22 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, (((__fs32*)((ubh)->bh[(begin) >> (uspi->s_fshift-2)]->b_data)) + \ ((begin) & ((uspi->s_fsize>>2) - 1))) +#define ubh_get_addr64(ubh,begin) \ + (((__fs64*)((ubh)->bh[(begin) >> (uspi->s_fshift-3)]->b_data)) + \ + ((begin) & ((uspi->s_fsize>>3) - 1))) + #define ubh_get_addr ubh_get_addr8 +static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + u64 blk) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return ubh_get_addr64(ubh, blk); + else + return ubh_get_addr32(ubh, blk); +} + #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) @@ -507,3 +521,46 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, if (fragsize > 0 && fragsize < uspi->s_fpb) fs32_add(sb, &fraglist[fragsize], cnt); } + +static inline void *ufs_get_direct_data_ptr(struct ufs_sb_private_info *uspi, + struct ufs_inode_info *ufsi, + unsigned blk) +{ + BUG_ON(blk > UFS_TIND_BLOCK); + return uspi->fs_magic == UFS2_MAGIC ? + (void *)&ufsi->i_u1.u2_i_data[blk] : + (void *)&ufsi->i_u1.i_data[blk]; +} + +static inline u64 ufs_data_ptr_to_cpu(struct super_block *sb, void *p) +{ + return UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC ? + fs64_to_cpu(sb, *(__fs64 *)p) : + fs32_to_cpu(sb, *(__fs32 *)p); +} + +static inline void ufs_cpu_to_data_ptr(struct super_block *sb, void *p, u64 val) +{ + if (UFS_SB(sb)->s_uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = cpu_to_fs64(sb, val); + else + *(__fs32 *)p = cpu_to_fs32(sb, val); +} + +static inline void ufs_data_ptr_clear(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + *(__fs64 *)p = 0; + else + *(__fs32 *)p = 0; +} + +static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, + void *p) +{ + if (uspi->fs_magic == UFS2_MAGIC) + return *(__fs64 *)p == 0; + else + return *(__fs32 *)p == 0; +} diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index 0afd745a37c..c28add2fbe9 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -996,7 +996,7 @@ error_inode: goto out; } -static struct inode_operations vfat_dir_inode_operations = { +static const struct inode_operations vfat_dir_inode_operations = { .create = vfat_create, .lookup = vfat_lookup, .unlink = vfat_unlink, diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 789a2559bd5..c6ad7c7e3ee 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -6,7 +6,6 @@ */ #include <linux/module.h> -#include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/posix_acl_xattr.h> diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 004baf60061..ed2b16dff91 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -15,7 +15,6 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <linux/sched.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/highmem.h> diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index 32e1ce0f04c..af168a1a98c 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -31,15 +31,13 @@ typedef struct { do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) -#define mraccess(mrp) mraccessf(mrp, 0) -#define mrupdate(mrp) mrupdatef(mrp, 0) -static inline void mraccessf(mrlock_t *mrp, int flags) +static inline void mraccess(mrlock_t *mrp) { down_read(&mrp->mr_lock); } -static inline void mrupdatef(mrlock_t *mrp, int flags) +static inline void mrupdate(mrlock_t *mrp) { down_write(&mrp->mr_lock); mrp->mr_writer = 1; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7b54461695e..143ffc851c9 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -56,8 +56,6 @@ xfs_count_page_state( do { if (buffer_uptodate(bh) && !buffer_mapped(bh)) (*unmapped) = 1; - else if (buffer_unwritten(bh) && !buffer_delay(bh)) - clear_buffer_unwritten(bh); else if (buffer_unwritten(bh)) (*unwritten) = 1; else if (buffer_delay(bh)) @@ -249,7 +247,7 @@ xfs_map_blocks( return -error; } -STATIC inline int +STATIC_INLINE int xfs_iomap_valid( xfs_iomap_t *iomapp, loff_t offset) @@ -1272,7 +1270,6 @@ __xfs_get_blocks( if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); - set_buffer_delay(bh_result); } } @@ -1283,13 +1280,18 @@ __xfs_get_blocks( bh_result->b_bdev = iomap.iomap_target->bt_bdev; /* - * If we previously allocated a block out beyond eof and we are - * now coming back to use it then we will need to flag it as new - * even if it has a disk address. + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. */ if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) + (offset >= i_size_read(inode)) || + (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) set_buffer_new(bh_result); if (iomap.iomap_flags & IOMAP_DELAY) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4fb01ffdfd1..e2bea6a661f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,13 +34,13 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> -STATIC kmem_zone_t *xfs_buf_zone; -STATIC kmem_shaker_t xfs_buf_shake; +static kmem_zone_t *xfs_buf_zone; +static kmem_shaker_t xfs_buf_shake; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); -STATIC struct workqueue_struct *xfslogd_workqueue; +static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; #ifdef XFS_BUF_TRACE @@ -139,7 +139,7 @@ page_region_mask( return mask; } -STATIC inline void +STATIC_INLINE void set_page_region( struct page *page, size_t offset, @@ -151,7 +151,7 @@ set_page_region( SetPageUptodate(page); } -STATIC inline int +STATIC_INLINE int test_page_region( struct page *page, size_t offset, @@ -171,9 +171,9 @@ typedef struct a_list { struct a_list *next; } a_list_t; -STATIC a_list_t *as_free_head; -STATIC int as_list_len; -STATIC DEFINE_SPINLOCK(as_lock); +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); /* * Try to batch vunmaps because they are costly. @@ -1085,7 +1085,7 @@ xfs_buf_iostart( return status; } -STATIC __inline__ int +STATIC_INLINE int _xfs_buf_iolocked( xfs_buf_t *bp) { @@ -1095,7 +1095,7 @@ _xfs_buf_iolocked( return 0; } -STATIC __inline__ void +STATIC_INLINE void _xfs_buf_ioend( xfs_buf_t *bp, int schedule) @@ -1426,8 +1426,8 @@ xfs_free_bufhash( /* * buftarg list for delwrite queue processing */ -STATIC LIST_HEAD(xfs_buftarg_list); -STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); +LIST_HEAD(xfs_buftarg_list); +static DEFINE_SPINLOCK(xfs_buftarg_lock); STATIC void xfs_register_buftarg( @@ -1679,21 +1679,60 @@ xfsbufd_wakeup( return 0; } +/* + * Move as many buffers as specified to the supplied list + * idicating if we skipped any buffers to prevent deadlocks. + */ +STATIC int +xfs_buf_delwri_split( + xfs_buftarg_t *target, + struct list_head *list, + unsigned long age) +{ + xfs_buf_t *bp, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; + int skipped = 0; + int force; + + force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); + INIT_LIST_HEAD(list); + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); + ASSERT(bp->b_flags & XBF_DELWRI); + + if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!force && + time_before(jiffies, bp->b_queuetime + age)) { + xfs_buf_unlock(bp); + break; + } + + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| + _XBF_RUN_QUEUES); + bp->b_flags |= XBF_WRITE; + list_move_tail(&bp->b_list, list); + } else + skipped++; + } + spin_unlock(dwlk); + + return skipped; + +} + STATIC int xfsbufd( - void *data) + void *data) { - struct list_head tmp; - unsigned long age; - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; - int count; + struct list_head tmp; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; + int count; + xfs_buf_t *bp; current->flags |= PF_MEMALLOC; - INIT_LIST_HEAD(&tmp); do { if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); @@ -1705,37 +1744,17 @@ xfsbufd( schedule_timeout_interruptible( xfs_buf_timer_centisecs * msecs_to_jiffies(10)); - count = 0; - age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { - if (!test_bit(XBT_FORCE_FLUSH, - &target->bt_flags) && - time_before(jiffies, - bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| - _XBF_RUN_QUEUES); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, &tmp); - count++; - } - } - spin_unlock(dwlk); + xfs_buf_delwri_split(target, &tmp, + xfs_buf_age_centisecs * msecs_to_jiffies(10)); + count = 0; while (!list_empty(&tmp)) { bp = list_entry(tmp.next, xfs_buf_t, b_list); ASSERT(target == bp->b_target); list_del_init(&bp->b_list); xfs_buf_iostrategy(bp); + count++; } if (as_list_len > 0) @@ -1743,7 +1762,6 @@ xfsbufd( if (count) blk_run_address_space(target->bt_mapping); - clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); return 0; @@ -1756,40 +1774,24 @@ xfsbufd( */ int xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) + xfs_buftarg_t *target, + int wait) { - struct list_head tmp; - xfs_buf_t *bp, *n; - int pincount = 0; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; + struct list_head tmp; + xfs_buf_t *bp, *n; + int pincount = 0; xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); - INIT_LIST_HEAD(&tmp); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - ASSERT(bp->b_target == target); - ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q)); - XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp)); - if (xfs_buf_ispin(bp)) { - pincount++; - continue; - } - - list_move_tail(&bp->b_list, &tmp); - } - spin_unlock(dwlk); + set_bit(XBT_FORCE_FLUSH, &target->bt_flags); + pincount = xfs_buf_delwri_split(target, &tmp, 0); /* * Dropped the delayed write list lock, now walk the temporary list */ list_for_each_entry_safe(bp, n, &tmp, b_list) { - xfs_buf_lock(bp); - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|_XBF_RUN_QUEUES); - bp->b_flags |= XBF_WRITE; + ASSERT(target == bp->b_target); if (wait) bp->b_flags &= ~XBF_ASYNC; else diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 9dd235cb010..9e8ef8fef39 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -69,8 +69,8 @@ typedef enum { } xfs_buf_flags_t; typedef enum { - XBT_FORCE_SLEEP = (0 << 1), - XBT_FORCE_FLUSH = (1 << 1), + XBT_FORCE_SLEEP = 0, + XBT_FORCE_FLUSH = 1, } xfs_buftarg_flags_t; typedef struct xfs_bufhash { diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 5fb75d9151f..e3a5fedac1b 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -24,7 +24,7 @@ #include "xfs_mount.h" #include "xfs_export.h" -STATIC struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; +static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; /* * XFS encodes and decodes the fileid portion of NFS filehandles diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index d26f5cd2ba7..cb51dc96135 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -46,7 +46,7 @@ static struct vm_operations_struct xfs_file_vm_ops; static struct vm_operations_struct xfs_dmapi_file_vm_ops; #endif -STATIC inline ssize_t +STATIC_INLINE ssize_t __xfs_file_read( struct kiocb *iocb, const struct iovec *iov, @@ -84,7 +84,7 @@ xfs_file_aio_read_invis( return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); } -STATIC inline ssize_t +STATIC_INLINE ssize_t __xfs_file_write( struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f011c9cd0d6..ff5c41ff8d4 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -41,8 +41,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" @@ -355,7 +353,6 @@ STATIC int xfs_readlink_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; @@ -388,7 +385,7 @@ xfs_readlink_by_handle( aiov.iov_len = olen; aiov.iov_base = hreq.ohandle; - auio.uio_iov = &aiov; + auio.uio_iov = (struct kvec *)&aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_segflg = UIO_USERSPACE; @@ -406,7 +403,6 @@ STATIC int xfs_fssetdm_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; @@ -448,7 +444,6 @@ STATIC int xfs_attrlist_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; @@ -569,7 +564,6 @@ STATIC int xfs_attrmulti_by_handle( xfs_mount_t *mp, void __user *arg, - struct file *parfilp, struct inode *parinode) { int error; @@ -689,7 +683,6 @@ xfs_ioc_xattr( STATIC int xfs_ioc_getbmap( bhv_desc_t *bdp, - struct file *filp, int flags, unsigned int cmd, void __user *arg); @@ -788,7 +781,7 @@ xfs_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); + return xfs_ioc_getbmap(bdp, ioflags, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(bdp, arg); @@ -802,16 +795,16 @@ xfs_ioctl( return xfs_open_by_handle(mp, arg, filp, inode); case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, filp, inode); + return xfs_fssetdm_by_handle(mp, arg, inode); case XFS_IOC_READLINK_BY_HANDLE: - return xfs_readlink_by_handle(mp, arg, filp, inode); + return xfs_readlink_by_handle(mp, arg, inode); case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, filp, inode); + return xfs_attrlist_by_handle(mp, arg, inode); case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, filp, inode); + return xfs_attrmulti_by_handle(mp, arg, inode); case XFS_IOC_SWAPEXT: { error = xfs_swapext((struct xfs_swapext __user *)arg); @@ -1095,11 +1088,6 @@ xfs_ioc_fsgeometry( /* * Linux extended inode flags interface. */ -#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */ -#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */ -#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */ -#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ -#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ STATIC unsigned int xfs_merge_ioc_xflags( @@ -1108,23 +1096,23 @@ xfs_merge_ioc_xflags( { unsigned int xflags = start; - if (flags & LINUX_XFLAG_IMMUTABLE) + if (flags & FS_IMMUTABLE_FL) xflags |= XFS_XFLAG_IMMUTABLE; else xflags &= ~XFS_XFLAG_IMMUTABLE; - if (flags & LINUX_XFLAG_APPEND) + if (flags & FS_APPEND_FL) xflags |= XFS_XFLAG_APPEND; else xflags &= ~XFS_XFLAG_APPEND; - if (flags & LINUX_XFLAG_SYNC) + if (flags & FS_SYNC_FL) xflags |= XFS_XFLAG_SYNC; else xflags &= ~XFS_XFLAG_SYNC; - if (flags & LINUX_XFLAG_NOATIME) + if (flags & FS_NOATIME_FL) xflags |= XFS_XFLAG_NOATIME; else xflags &= ~XFS_XFLAG_NOATIME; - if (flags & LINUX_XFLAG_NODUMP) + if (flags & FS_NODUMP_FL) xflags |= XFS_XFLAG_NODUMP; else xflags &= ~XFS_XFLAG_NODUMP; @@ -1139,15 +1127,15 @@ xfs_di2lxflags( unsigned int flags = 0; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= LINUX_XFLAG_IMMUTABLE; + flags |= FS_IMMUTABLE_FL; if (di_flags & XFS_DIFLAG_APPEND) - flags |= LINUX_XFLAG_APPEND; + flags |= FS_APPEND_FL; if (di_flags & XFS_DIFLAG_SYNC) - flags |= LINUX_XFLAG_SYNC; + flags |= FS_SYNC_FL; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= LINUX_XFLAG_NOATIME; + flags |= FS_NOATIME_FL; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= LINUX_XFLAG_NODUMP; + flags |= FS_NODUMP_FL; return flags; } @@ -1247,9 +1235,9 @@ xfs_ioc_xattr( break; } - if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \ - LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \ - LINUX_XFLAG_SYNC)) { + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) { error = -EOPNOTSUPP; break; } @@ -1281,7 +1269,6 @@ xfs_ioc_xattr( STATIC int xfs_ioc_getbmap( bhv_desc_t *bdp, - struct file *filp, int ioflags, unsigned int cmd, void __user *arg) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 3ba814ae3bb..0b5fa124bef 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -250,13 +248,13 @@ xfs_init_security( * * XXX(hch): nfsd is broken, better fix it instead. */ -STATIC inline int +STATIC_INLINE int xfs_has_fs_struct(struct task_struct *task) { return (task->fs != init_task.fs); } -STATIC inline void +STATIC void xfs_cleanup_inode( bhv_vnode_t *dvp, bhv_vnode_t *vp, @@ -815,7 +813,7 @@ xfs_vn_removexattr( } -struct inode_operations xfs_inode_operations = { +const struct inode_operations xfs_inode_operations = { .permission = xfs_vn_permission, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, @@ -826,7 +824,7 @@ struct inode_operations xfs_inode_operations = { .removexattr = xfs_vn_removexattr, }; -struct inode_operations xfs_dir_inode_operations = { +const struct inode_operations xfs_dir_inode_operations = { .create = xfs_vn_create, .lookup = xfs_vn_lookup, .link = xfs_vn_link, @@ -845,7 +843,7 @@ struct inode_operations xfs_dir_inode_operations = { .removexattr = xfs_vn_removexattr, }; -struct inode_operations xfs_symlink_inode_operations = { +const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index ad6173da567..95a69398fce 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -18,9 +18,9 @@ #ifndef __XFS_IOPS_H__ #define __XFS_IOPS_H__ -extern struct inode_operations xfs_inode_operations; -extern struct inode_operations xfs_dir_inode_operations; -extern struct inode_operations xfs_symlink_inode_operations; +extern const struct inode_operations xfs_inode_operations; +extern const struct inode_operations xfs_dir_inode_operations; +extern const struct inode_operations xfs_symlink_inode_operations; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 2b0e0018738..715adad7dd4 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -109,16 +109,6 @@ #undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #endif -/* - * State flag for unwritten extent buffers. - * - * We need to be able to distinguish between these and delayed - * allocate buffers within XFS. The generic IO path code does - * not need to distinguish - we use the BH_Delay flag for both - * delalloc and these ondisk-uninitialised buffers. - */ -BUFFER_FNS(PrivateStart, unwritten); - #define restricted_chown xfs_params.restrict_chown.val #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 65e79b471d4..ff8d64eba9f 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_inode_item.h" #include "xfs_buf_item.h" @@ -134,13 +132,11 @@ STATIC int xfs_iozero( struct inode *ip, /* inode */ loff_t pos, /* offset in file */ - size_t count, /* size of data to zero */ - loff_t end_size) /* max file size to set */ + size_t count) /* size of data to zero */ { unsigned bytes; struct page *page; struct address_space *mapping; - char *kaddr; int status; mapping = ip->i_mapping; @@ -158,26 +154,21 @@ xfs_iozero( if (!page) break; - kaddr = kmap(page); status = mapping->a_ops->prepare_write(NULL, page, offset, offset + bytes); - if (status) { + if (status) goto unlock; - } - memset((void *) (kaddr + offset), 0, bytes); - flush_dcache_page(page); + memclear_highpage_flush(page, offset, bytes); + status = mapping->a_ops->commit_write(NULL, page, offset, offset + bytes); if (!status) { pos += bytes; count -= bytes; - if (pos > i_size_read(ip)) - i_size_write(ip, pos < end_size ? pos : end_size); } unlock: - kunmap(page); unlock_page(page); page_cache_release(page); if (status) @@ -449,8 +440,8 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct inode *ip, xfs_iocore_t *io, - xfs_fsize_t isize, - xfs_fsize_t end_size) + xfs_fsize_t offset, + xfs_fsize_t isize) { xfs_fileoff_t last_fsb; xfs_mount_t *mp = io->io_mount; @@ -459,7 +450,6 @@ xfs_zero_last_block( int zero_len; int error = 0; xfs_bmbt_irec_t imap; - loff_t loff; ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); @@ -494,9 +484,10 @@ xfs_zero_last_block( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, last_fsb); zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); ASSERT(error >= 0); @@ -519,14 +510,15 @@ xfs_zero_eof( bhv_vnode_t *vp, xfs_iocore_t *io, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - xfs_fsize_t end_size) /* terminal inode size */ + xfs_fsize_t isize) /* current inode size */ { struct inode *ip = vn_to_inode(vp); xfs_fileoff_t start_zero_fsb; xfs_fileoff_t end_zero_fsb; xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; xfs_mount_t *mp = io->io_mount; int nimaps; int error = 0; @@ -540,7 +532,7 @@ xfs_zero_eof( * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, io, isize, end_size); + error = xfs_zero_last_block(ip, io, offset, isize); if (error) { ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); @@ -601,10 +593,13 @@ xfs_zero_eof( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - error = xfs_iozero(ip, - XFS_FSB_TO_B(mp, start_zero_fsb), - XFS_FSB_TO_B(mp, imap.br_blockcount), - end_size); + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); if (error) { goto out_lock; } @@ -783,8 +778,7 @@ start: */ if (pos > isize) { - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, - isize, pos + count); + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); if (error) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index c77e62efb74..7ac51b1d216 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -83,7 +83,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t, - xfs_fsize_t, xfs_fsize_t); + xfs_fsize_t); extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, const struct iovec *, unsigned int, loff_t *, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index b93265b7c79..1a4103ca593 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -58,10 +56,10 @@ #include <linux/kthread.h> #include <linux/freezer.h> -STATIC struct quotactl_ops xfs_quotactl_operations; -STATIC struct super_operations xfs_super_operations; -STATIC kmem_zone_t *xfs_vnode_zone; -STATIC kmem_zone_t *xfs_ioend_zone; +static struct quotactl_ops xfs_quotactl_operations; +static struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_vnode_zone; +static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; STATIC struct xfs_mount_args * @@ -121,7 +119,7 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } -STATIC __inline__ void +STATIC_INLINE void xfs_set_inodeops( struct inode *inode) { @@ -147,7 +145,7 @@ xfs_set_inodeops( } } -STATIC __inline__ void +STATIC_INLINE void xfs_revalidate_inode( xfs_mount_t *mp, bhv_vnode_t *vp, @@ -553,7 +551,6 @@ vfs_sync_worker( error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \ SYNC_ATTR | SYNC_REFCACHE, NULL); vfsp->vfs_sync_seq++; - wmb(); wake_up(&vfsp->vfs_wait_single_sync_task); } @@ -659,9 +656,17 @@ xfs_fs_sync_super( int error; int flags; - if (unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - flags = SYNC_QUIESCE; - else + if (unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { + /* + * First stage of freeze - no more writers will make progress + * now we are here, so we flush delwri and delalloc buffers + * here, then wait for all I/O to complete. Data is frozen at + * that point. Metadata is not frozen, transactions can still + * occur here so don't bother flushing the buftarg (i.e + * SYNC_QUIESCE) because it'll just get dirty again. + */ + flags = SYNC_FSDATA | SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT; + } else flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); error = bhv_vfs_sync(vfsp, flags, NULL); @@ -873,7 +878,7 @@ xfs_fs_get_sb( mnt); } -STATIC struct super_operations xfs_super_operations = { +static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .write_inode = xfs_fs_write_inode, @@ -887,7 +892,7 @@ STATIC struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, }; -STATIC struct quotactl_ops xfs_quotactl_operations = { +static struct quotactl_ops xfs_quotactl_operations = { .quota_sync = xfs_fs_quotasync, .get_xstate = xfs_fs_getxstate, .set_xstate = xfs_fs_setxstate, diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index af246532fbf..cd6eaa44aa2 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -54,102 +54,204 @@ xfs_stats_clear_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC ctl_table xfs_table[] = { - {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max}, - - {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max}, - - {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max}, - - {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.panic_mask.min, &xfs_params.panic_mask.max}, - - {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.error_level.min, &xfs_params.error_level.max}, - - {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max}, - - {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max}, - - {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max}, - - {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max}, - - {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max}, - - {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max}, - - {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max}, - - {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.rotorstep.min, &xfs_params.rotorstep.max}, - - {XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max}, +static ctl_table xfs_table[] = { + { + .ctl_name = XFS_RESTRICT_CHOWN, + .procname = "restrict_chown", + .data = &xfs_params.restrict_chown.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.restrict_chown.min, + .extra2 = &xfs_params.restrict_chown.max + }, + { + .ctl_name = XFS_SGID_INHERIT, + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .ctl_name = XFS_SYMLINK_MODE, + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .ctl_name = XFS_PANIC_MASK, + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + { + .ctl_name = XFS_ERRLEVEL, + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .ctl_name = XFS_SYNCD_TIMER, + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .ctl_name = XFS_INHERIT_SYNC, + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .ctl_name = XFS_INHERIT_NODUMP, + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .ctl_name = XFS_INHERIT_NOATIME, + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .ctl_name = XFS_BUF_TIMER, + .procname = "xfsbufd_centisecs", + .data = &xfs_params.xfs_buf_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.xfs_buf_timer.min, + .extra2 = &xfs_params.xfs_buf_timer.max + }, + { + .ctl_name = XFS_BUF_AGE, + .procname = "age_buffer_centisecs", + .data = &xfs_params.xfs_buf_age.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.xfs_buf_age.min, + .extra2 = &xfs_params.xfs_buf_age.max + }, + { + .ctl_name = XFS_INHERIT_NOSYM, + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .ctl_name = XFS_ROTORSTEP, + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .ctl_name = XFS_INHERIT_NODFRG, + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS - {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, - sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler, - &sysctl_intvec, NULL, - &xfs_params.stats_clear.min, &xfs_params.stats_clear.max}, + { + .ctl_name = XFS_STATS_CLEAR, + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &xfs_stats_clear_proc_handler, + .strategy = &sysctl_intvec, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, #endif /* CONFIG_PROC_FS */ - {0} + {} }; -STATIC ctl_table xfs_dir_table[] = { - {FS_XFS, "xfs", NULL, 0, 0555, xfs_table}, - {0} +static ctl_table xfs_dir_table[] = { + { + .ctl_name = FS_XFS, + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} }; -STATIC ctl_table xfs_root_table[] = { - {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table}, - {0} +static ctl_table xfs_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} }; void xfs_sysctl_register(void) { - xfs_table_header = register_sysctl_table(xfs_root_table, 1); + xfs_table_header = register_sysctl_table(xfs_root_table); } void diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index da255bdf526..e2c2ce98ab5 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -91,7 +91,7 @@ typedef enum { #define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ -#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */ +#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ #define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index 553fa731ade..ada24baf88d 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -26,7 +26,7 @@ DEFINE_SPINLOCK(vnumber_lock); */ #define NVSYNC 37 #define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) -STATIC wait_queue_head_t vsync[NVSYNC]; +static wait_queue_head_t vsync[NVSYNC]; void vn_init(void) diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 515f5fdea57..b76118cf489 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -489,14 +489,14 @@ static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp) #define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock) #define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s) -static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag) +STATIC_INLINE void vn_flagset(struct bhv_vnode *vp, uint flag) { spin_lock(&vp->v_lock); vp->v_flag |= flag; spin_unlock(&vp->v_lock); } -static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag) +STATIC_INLINE uint vn_flagclr(struct bhv_vnode *vp, uint flag) { uint cleared; diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 3aa77153185..4adaf13aac6 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -484,7 +482,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) { + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { goto error1; } diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 33ad5af386e..ddb61fe22a5 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -399,7 +397,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -STATIC struct xfs_item_ops xfs_dquot_item_ops = { +static struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, @@ -606,7 +604,7 @@ xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) return; } -STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, @@ -628,7 +626,7 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 7c6a3a50379..1de2acdc7f7 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -44,8 +44,6 @@ #include "xfs_bmap.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -64,10 +62,10 @@ uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -STATIC kmem_shaker_t xfs_qm_shaker; +static kmem_shaker_t xfs_qm_shaker; -STATIC cred_t xfs_zerocr; -STATIC xfs_inode_t xfs_zeroino; +static cred_t xfs_zerocr; +static xfs_inode_t xfs_zeroino; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index db8872be8c8..d2cdb8a2aad 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -44,8 +44,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" @@ -384,7 +382,7 @@ xfs_qm_dqrele_null( } -STATIC struct xfs_qmops xfs_qmcore_xfs = { +static struct xfs_qmops xfs_qmcore_xfs = { .xfs_qminit = xfs_qm_newmount, .xfs_qmdone = xfs_qm_unmount_quotadestroy, .xfs_qmmount = xfs_qm_endmount, diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 6f858fb81a3..709f5f545cf 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -43,8 +43,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index ed620c4d159..716f562aa8b 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -46,8 +46,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -134,7 +132,7 @@ xfs_qm_quotactl( break; case Q_XQUOTASYNC: - return (xfs_sync_inodes(mp, SYNC_DELWRI, 0, NULL)); + return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); default: break; diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 0242e9666e8..d7491e7b1f3 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -43,8 +43,6 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 4363512d2f9..08bbd3cb87a 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -19,7 +19,7 @@ #include "debug.h" #include "spin.h" -static char message[256]; /* keep it off the stack */ +static char message[1024]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); /* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ @@ -44,13 +44,14 @@ cmn_err(register int level, char *fmt, ...) spin_lock_irqsave(&xfs_err_lock,flags); va_start(ap, fmt); if (*fmt == '!') fp++; - len = vsprintf(message, fp, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); - printk("%s%s", err_level[level], message); + len = vsnprintf(message, sizeof(message), fp, ap); + if (len >= sizeof(message)) + len = sizeof(message) - 1; + if (message[len-1] == '\n') + message[len-1] = 0; + printk("%s%s\n", err_level[level], message); va_end(ap); spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); } @@ -64,11 +65,13 @@ icmn_err(register int level, char *fmt, va_list ap) if(level > XFS_MAX_ERR_LEVEL) level = XFS_MAX_ERR_LEVEL; spin_lock_irqsave(&xfs_err_lock,flags); - len = vsprintf(message, fmt, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); + len = vsnprintf(message, sizeof(message), fmt, ap); + if (len >= sizeof(message)) + len = sizeof(message) - 1; + if (message[len-1] == '\n') + message[len-1] = 0; + printk("%s%s\n", err_level[level], message); spin_unlock_irqrestore(&xfs_err_lock,flags); - printk("%s%s", err_level[level], message); BUG_ON(level == CE_PANIC); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 4f54dca662a..2a70cc605ae 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -38,13 +38,37 @@ extern void assfail(char *expr, char *f, int l); #ifndef DEBUG # define ASSERT(expr) ((void)0) -#else + +#ifndef STATIC +# define STATIC static noinline +#endif + +#ifndef STATIC_INLINE +# define STATIC_INLINE static inline +#endif + +#else /* DEBUG */ + # define ASSERT(expr) ASSERT_ALWAYS(expr) extern unsigned long random(void); -#endif #ifndef STATIC -# define STATIC static +# define STATIC noinline #endif +/* + * We stop inlining of inline functions in debug mode. + * Unfortunately, this means static inline in header files + * get multiple definitions, so they need to remain static. + * This then gives tonnes of warnings about unused but defined + * functions, so we need to add the unused attribute to prevent + * these spurious warnings. + */ +#ifndef STATIC_INLINE +# define STATIC_INLINE static __attribute__ ((unused)) noinline +#endif + +#endif /* DEBUG */ + + #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h index 977879c24ff..324e413dead 100644 --- a/fs/xfs/support/move.h +++ b/fs/xfs/support/move.h @@ -55,7 +55,7 @@ enum uio_seg { }; struct uio { - struct iovec *uio_iov; /* pointer to array of iovecs */ + struct kvec *uio_iov; /* pointer to array of iovecs */ int uio_iovcnt; /* number of iovecs in array */ xfs_off_t uio_offset; /* offset in file this uio corresponds to */ int uio_resid; /* residual i/o count */ @@ -63,7 +63,7 @@ struct uio { }; typedef struct uio uio_t; -typedef struct iovec iovec_t; +typedef struct kvec iovec_t; extern int xfs_uio_read (caddr_t, size_t, uio_t *); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4b0cb474be4..4ca4beb7bb5 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -31,7 +31,6 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_acl.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include <linux/capability.h> diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index bce81c7a4fd..5bd1a2c8bd0 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -58,7 +58,6 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t; /* * Real block structures have a size equal to the disk block size. */ -#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0]) #define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0]) @@ -87,16 +86,13 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t; * Record, key, and pointer address macros for btree blocks. */ #define XFS_ALLOC_REC_ADDR(bb,i,cur) \ - XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(0, cur)) + XFS_BTREE_REC_ADDR(xfs_alloc, bb, i) #define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ - XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) + XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i) #define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ - XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) + XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) /* * Decrement cursor by one record at the level. diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 9ada7bdbae5..9d358ffce4e 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -57,9 +57,9 @@ */ #define ATTR_SYSCOUNT 2 -STATIC struct attrnames posix_acl_access; -STATIC struct attrnames posix_acl_default; -STATIC struct attrnames *attr_system_names[ATTR_SYSCOUNT]; +static struct attrnames posix_acl_access; +static struct attrnames posix_acl_default; +static struct attrnames *attr_system_names[ATTR_SYSCOUNT]; /*======================================================================== * Function prototypes for the kernel. @@ -199,18 +199,14 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, return (error); /* - * Determine space new attribute will use, and if it would be - * "local" or "remote" (note: local != inline). - */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, &local); - - /* * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ if (XFS_IFORK_Q(dp) == 0) { - if ((error = xfs_bmap_add_attrfork(dp, size, rsvd))) + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(namelen, valuelen); + + if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) return(error); } @@ -231,6 +227,13 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, args.addname = 1; args.oknoent = 1; + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(namelen, valuelen, + mp->m_sb.sb_blocksize, &local); + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); if (local) { if (size > (mp->m_sb.sb_blocksize >> 1)) { @@ -346,7 +349,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, error = xfs_attr_shortform_to_leaf(&args); if (!error) { error = xfs_bmap_finish(&args.trans, args.flist, - *args.firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -973,7 +976,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1074,7 +1077,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1152,7 +1154,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1307,7 +1309,6 @@ restart: if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1347,7 +1348,7 @@ restart: error = xfs_da_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1459,7 +1460,6 @@ restart: if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1594,7 +1594,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1646,7 +1646,6 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -2090,7 +2089,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) args->flist, NULL); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -2246,7 +2245,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) NULL, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -2477,7 +2476,7 @@ posix_acl_default_exists( return xfs_acl_vhasacl_default(vp); } -STATIC struct attrnames posix_acl_access = { +static struct attrnames posix_acl_access = { .attr_name = "posix_acl_access", .attr_namelen = sizeof("posix_acl_access") - 1, .attr_get = posix_acl_access_get, @@ -2486,7 +2485,7 @@ STATIC struct attrnames posix_acl_access = { .attr_exists = posix_acl_access_exists, }; -STATIC struct attrnames posix_acl_default = { +static struct attrnames posix_acl_default = { .attr_name = "posix_acl_default", .attr_namelen = sizeof("posix_acl_default") - 1, .attr_get = posix_acl_default_get, @@ -2495,7 +2494,7 @@ STATIC struct attrnames posix_acl_default = { .attr_exists = posix_acl_default_exists, }; -STATIC struct attrnames *attr_system_names[] = +static struct attrnames *attr_system_names[] = { &posix_acl_access, &posix_acl_default }; diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 9719bbef122..8eab73e8340 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -94,7 +94,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); * Namespace helper routines *========================================================================*/ -STATIC inline attrnames_t * +STATIC_INLINE attrnames_t * xfs_attr_flags_namesp(int flags) { return ((flags & XFS_ATTR_SECURE) ? &attr_secure: @@ -105,7 +105,7 @@ xfs_attr_flags_namesp(int flags) * If namespace bits don't match return 0. * If all match then return 1. */ -STATIC inline int +STATIC_INLINE int xfs_attr_namesp_match(int arg_flags, int ondisk_flags) { return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); @@ -116,7 +116,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) * then return 0. * If all match or are overridable then return 1. */ -STATIC inline int +STATIC_INLINE int xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags) { if (((arg_flags & ATTR_SECURE) == 0) != @@ -150,6 +150,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; xfs_mount_t *mp = dp->i_mount; offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ @@ -169,8 +170,43 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; } - /* data fork btree root can have at least this many key/ptr pairs */ - minforkoff = MAX(dp->i_df.if_bytes, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + + case XFS_DINODE_FMT_BTREE: + /* + * If have data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + else + return dp->i_d.di_forkoff; + } else + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + break; + } + + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ @@ -336,7 +372,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) */ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - (mp->m_flags & XFS_MOUNT_ATTR2)) { + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -355,7 +392,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - !(mp->m_flags & XFS_MOUNT_ATTR2)); + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -748,6 +786,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return(-1); return(xfs_attr_shortform_bytesfit(dp, bytes)); @@ -786,6 +825,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); /* * Last attribute was removed, revert to original diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 43be6a7e47c..1afe07f67e3 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -29,7 +29,7 @@ /* * Index of high bit number in byte, -1 for none set, 0..7 otherwise. */ -STATIC const char xfs_highbit[256] = { +static const char xfs_highbit[256] = { -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 498ad50d1f4..87795188ced 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -185,16 +185,6 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork); /* data or attr fork */ -#ifdef DEBUG -/* - * Check that the extents list for the inode ip is in the right order. - */ -STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork); /* data or attr fork */ -#endif - /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -410,7 +400,6 @@ xfs_bmap_count_leaves( STATIC int xfs_bmap_disk_count_leaves( xfs_ifork_t *ifp, - xfs_mount_t *mp, xfs_extnum_t idx, xfs_bmbt_block_t *block, int numrecs, @@ -684,7 +673,7 @@ xfs_bmap_add_extent( ASSERT(nblks <= da_old); if (nblks < da_old) xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(da_old - nblks), rsvd); + (int64_t)(da_old - nblks), rsvd); } /* * Clear out the allocated field, done with it now in any case. @@ -1209,7 +1198,7 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) { + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { /* * Ick gross gag me with a spoon. */ @@ -1220,7 +1209,7 @@ xfs_bmap_add_extent_delay_real( diff--; if (!diff || !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) + XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) break; } if (temp2) { @@ -1228,7 +1217,7 @@ xfs_bmap_add_extent_delay_real( diff--; if (!diff || !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) + XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) break; } } @@ -2015,7 +2004,7 @@ xfs_bmap_add_extent_hole_delay( if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(oldlen - newlen), rsvd); + (int64_t)(oldlen - newlen), rsvd); /* * Nothing to do for disk quota accounting here. */ @@ -3359,7 +3348,7 @@ xfs_bmap_del_extent( */ ASSERT(da_old >= da_new); if (da_old > da_new) - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new), + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), rsvd); if (delta) { /* DELTA: report the original extent. */ @@ -3543,6 +3532,7 @@ xfs_bmap_forkoff_reset( if (whichfork == XFS_ATTR_FORK && (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { ip->i_d.di_forkoff = mp->m_attroffset >> 3; ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / @@ -4079,7 +4069,7 @@ xfs_bmap_add_attrfork( } else XFS_SB_UNLOCK(mp, s); } - if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) + if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL); ASSERT(ip->i_df.if_ext_max == @@ -4212,7 +4202,6 @@ int /* error */ xfs_bmap_finish( xfs_trans_t **tp, /* transaction pointer addr */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled ag for allocs */ int *committed) /* xact committed or not */ { xfs_efd_log_item_t *efd; /* extent free data */ @@ -4533,8 +4522,7 @@ xfs_bmap_read_extents( error0); if (level == 0) break; - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); @@ -4577,8 +4565,7 @@ xfs_bmap_read_extents( /* * Copy records into the extent records. */ - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); + frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); start = i; for (j = 0; j < num_recs; j++, i++, frp++) { trp = xfs_iext_get_ext(ifp, i); @@ -4929,28 +4916,28 @@ xfs_bmapi( if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -(extsz), (flags & + -((int64_t)extsz), (flags & XFS_BMAPI_RSVBLOCKS)); } else { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - -(alen), (flags & + -((int64_t)alen), (flags & XFS_BMAPI_RSVBLOCKS)); } if (!error) { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - -(indlen), (flags & + -((int64_t)indlen), (flags & XFS_BMAPI_RSVBLOCKS)); if (error && rt) xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - extsz, (flags & + (int64_t)extsz, (flags & XFS_BMAPI_RSVBLOCKS)); else if (error) xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - alen, (flags & + (int64_t)alen, (flags & XFS_BMAPI_RSVBLOCKS)); } @@ -5616,13 +5603,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int)rtexts, rsvd); + (int64_t)rtexts, rsvd); (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - (int)del.br_blockcount, rsvd); + (int64_t)del.br_blockcount, rsvd); (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -6048,32 +6035,6 @@ xfs_bmap_eof( } #ifdef DEBUG -/* - * Check that the extents list for the inode ip is in the right order. - */ -STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* current extent entry */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extents in list */ - xfs_bmbt_rec_t *nextp; /* next extent entry */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ep = xfs_iext_get_ext(ifp, 0); - for (idx = 0; idx < nextents - 1; idx++) { - nextp = xfs_iext_get_ext(ifp, idx + 1); - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - ep = nextp; - } -} - STATIC xfs_buf_t * xfs_bmap_get_bp( @@ -6156,8 +6117,7 @@ xfs_check_block( if (root) { keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); } else { - keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); + keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i); } if (prevp) { @@ -6172,15 +6132,14 @@ xfs_check_block( if (root) { pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); } else { - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr); } for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { if (root) { thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); } else { - thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, j, dmxr); + thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, + dmxr); } if (*thispa == *pp) { cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", @@ -6267,8 +6226,7 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { @@ -6305,11 +6263,9 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. */ - ep = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); + ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); for (j = 1; j < num_recs; j++) { - nextp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, j + 1, mp->m_bmap_dmxr[0]); + nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); if (lastp) { xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)lastp, (void *)ep); @@ -6454,8 +6410,7 @@ xfs_bmap_count_tree( } /* Dive to the next level */ - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (unlikely((error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { @@ -6470,7 +6425,7 @@ xfs_bmap_count_tree( for (;;) { nextbno = be64_to_cpu(block->bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); - if (unlikely(xfs_bmap_disk_count_leaves(ifp, mp, + if (unlikely(xfs_bmap_disk_count_leaves(ifp, 0, block, numrecs, count) < 0)) { xfs_trans_brelse(tp, bp); XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", @@ -6518,7 +6473,6 @@ xfs_bmap_count_leaves( int xfs_bmap_disk_count_leaves( xfs_ifork_t *ifp, - xfs_mount_t *mp, xfs_extnum_t idx, xfs_bmbt_block_t *block, int numrecs, @@ -6528,8 +6482,7 @@ xfs_bmap_disk_count_leaves( xfs_bmbt_rec_t *frp; for (b = 1; b <= numrecs; b++) { - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, idx + b, mp->m_bmap_dmxr[0]); + frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); *count += xfs_bmbt_disk_get_blockcount(frp); } return 0; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 80e93409b78..4f24c7e39b3 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -202,7 +202,6 @@ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled a.g. for allocs */ int *committed); /* xact committed or not */ /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index a7b835bf870..0bf192fea3e 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -678,47 +678,6 @@ error0: return error; } -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -int -xfs_bmbt_get_rec( - xfs_btree_cur_t *cur, - xfs_fileoff_t *off, - xfs_fsblock_t *bno, - xfs_filblks_t *len, - xfs_exntst_t *state, - int *stat) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; -#ifdef DEBUG - int error; -#endif - int ptr; - xfs_bmbt_rec_t *rp; - - block = xfs_bmbt_get_block(cur, 0, &bp); - ptr = cur->bc_ptrs[0]; -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) - return error; -#endif - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - *off = xfs_bmbt_disk_get_startoff(rp); - *bno = xfs_bmbt_disk_get_startblock(rp); - *len = xfs_bmbt_disk_get_blockcount(rp); - *state = xfs_bmbt_disk_get_state(rp); - *stat = 1; - return 0; -} -#endif - /* * Insert one record/level. Return information to the caller * allowing the next level up to proceed if necessary. @@ -1731,9 +1690,9 @@ xfs_bmdr_to_bmbt( rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); @@ -1862,7 +1821,7 @@ xfs_bmbt_delete( * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. */ -STATIC __inline__ void +STATIC_INLINE void __xfs_bmbt_get_all( __uint64_t l0, __uint64_t l1, @@ -2016,30 +1975,6 @@ xfs_bmbt_disk_get_blockcount( } /* - * Extract the startblock field from an on disk bmap extent record. - */ -xfs_fsblock_t -xfs_bmbt_disk_get_startblock( - xfs_bmbt_rec_t *r) -{ -#if XFS_BIG_BLKNOS - return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#else -#ifdef DEBUG - xfs_dfsbno_t b; - - b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); - return (xfs_fsblock_t)b; -#else /* !DEBUG */ - return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#endif /* DEBUG */ -#endif /* XFS_BIG_BLKNOS */ -} - -/* * Extract the startoff field from a disk format bmap extent record. */ xfs_fileoff_t @@ -2049,17 +1984,6 @@ xfs_bmbt_disk_get_startoff( return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } - -xfs_exntst_t -xfs_bmbt_disk_get_state( - xfs_bmbt_rec_t *r) -{ - int ext_flag; - - ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN)); - return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r), - ext_flag); -} #endif /* XFS_NATIVE_HOST */ @@ -2684,9 +2608,9 @@ xfs_bmbt_to_bmdr( dblock->bb_numrecs = rblock->bb_numrecs; dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); - tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 49539de9525..a77b1b753d0 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -175,19 +175,11 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; #define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) -#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) #define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ (cur)->bc_private.b.whichfork)->if_broot_bytes) -#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_DSIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) -#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_ISIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) - #define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ (((lev) == (cur)->bc_nlevels - 1 ? \ XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ @@ -210,37 +202,21 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; xfs_bmbt, (lev) == 0) : \ ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) -#define XFS_BMAP_REC_DADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_REC_IADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) +#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) + +#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) #define XFS_BMAP_KEY_DADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) + #define XFS_BMAP_KEY_IADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) #define XFS_BMAP_PTR_DADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ + (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ be16_to_cpu((bb)->bb_level), cur))) #define XFS_BMAP_PTR_IADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ + (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ be16_to_cpu((bb)->bb_level), cur))) /* @@ -248,11 +224,11 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; * we don't have a cursor. */ #define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ - (XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i)) #define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \ - (XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i)) #define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \ - (XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) + (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) #define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs) #define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0) @@ -315,15 +291,11 @@ extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_t *r); #ifndef XFS_NATIVE_HOST extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern xfs_exntst_t xfs_bmbt_disk_get_state(xfs_bmbt_rec_t *r); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fsblock_t xfs_bmbt_disk_get_startblock(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); #else #define xfs_bmbt_disk_get_all(r, s) xfs_bmbt_get_all(r, s) -#define xfs_bmbt_disk_get_state(r) xfs_bmbt_get_state(r) #define xfs_bmbt_disk_get_blockcount(r) xfs_bmbt_get_blockcount(r) -#define xfs_bmbt_disk_get_startblock(r) xfs_bmbt_get_blockcount(r) #define xfs_bmbt_disk_get_startoff(r) xfs_bmbt_get_startoff(r) #endif /* XFS_NATIVE_HOST */ @@ -364,15 +336,6 @@ extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -extern int xfs_bmbt_get_rec(struct xfs_btree_cur *, xfs_fileoff_t *, - xfs_fsblock_t *, xfs_filblks_t *, - xfs_exntst_t *, int *); -#endif - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 892b06c5426..4e27d55a1e7 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -122,13 +122,13 @@ extern const __uint32_t xfs_magics[]; * Given block size, type prefix, block pointer, and index of requested entry * (first entry numbered 1). */ -#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_REC_ADDR(t,bb,i) \ ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ ((i) - 1) * sizeof(t ## _rec_t))) -#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_KEY_ADDR(t,bb,i) \ ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ ((i) - 1) * sizeof(t ## _key_t))) -#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \ +#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7a55c248ea7..6c1bddc04e3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -660,7 +660,7 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) /* * This is the ops vector shared by all buf log items. */ -STATIC struct xfs_item_ops xfs_buf_item_ops = { +static struct xfs_item_ops xfs_buf_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 07c708c2b52..d7e13614306 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -21,23 +21,7 @@ /* * This is the structure used to lay out a buf log item in the * log. The data map describes which 128 byte chunks of the buffer - * have been logged. This structure works only on buffers that - * reside up to the first TB in the filesystem. These buffers are - * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF. - */ -typedef struct xfs_buf_log_format_v1 { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - __int32_t blf_blkno; /* starting blkno of this buf */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_v1_t; - -/* - * This is a form of the above structure with a 64 bit blkno field. + * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ typedef struct xfs_buf_log_format_t { diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h deleted file mode 100644 index 7a0e482dd43..00000000000 --- a/fs/xfs/xfs_cap.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CAP_H__ -#define __XFS_CAP_H__ - -/* - * Capabilities - */ -typedef __uint64_t xfs_cap_value_t; - -typedef struct xfs_cap_set { - xfs_cap_value_t cap_effective; /* use in capability checks */ - xfs_cap_value_t cap_permitted; /* combined with file attrs */ - xfs_cap_value_t cap_inheritable;/* pass through exec */ -} xfs_cap_set_t; - -/* On-disk XFS extended attribute names */ -#define SGI_CAP_FILE "SGI_CAP_FILE" -#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1) -#define SGI_CAP_LINUX "SGI_CAP_LINUX" -#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1) - -/* - * For Linux, we take the bitfields directly from capability.h - * and no longer attempt to keep this attribute ondisk compatible - * with IRIX. Since this attribute is only set on executables, - * it just doesn't make much sense to try. We do use a different - * named attribute though, to avoid confusion. - */ - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_CAP - -#include <linux/posix_cap_xattr.h> - -struct bhv_vnode; - -extern int xfs_cap_vhascap(struct bhv_vnode *); -extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vremove(struct bhv_vnode *); - -#define _CAP_EXISTS xfs_cap_vhascap - -#else -#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vremove(v) (-EOPNOTSUPP) -#define _CAP_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_CAP_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index a68bc1f1a31..aea37df4aa6 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1090,8 +1090,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) if (blk->magic == XFS_DA_NODE_MAGIC) { node = blk->bp->data; max = be16_to_cpu(node->hdr.count); - btreehashval = node->btree[max-1].hashval; - blk->hashval = be32_to_cpu(btreehashval); + blk->hashval = be32_to_cpu(node->btree[max-1].hashval); /* * Binary search. (note: small blocks will skip loop) @@ -2166,21 +2165,6 @@ xfs_da_reada_buf( return rval; } -/* - * Calculate the number of bits needed to hold i different values. - */ -uint -xfs_da_log2_roundup(uint i) -{ - uint rval; - - for (rval = 0; rval < NBBY * sizeof(i); rval++) { - if ((1 << rval) >= i) - break; - } - return(rval); -} - kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 4ab865ec8b8..44dabf02f2a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -249,7 +249,6 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_dabuf_t *dead_buf); uint xfs_da_hashname(const uchar_t *name_string, int name_length); -uint xfs_da_log2_roundup(uint i); xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 50d0faea371..b847e6a7a3f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -41,7 +41,6 @@ #include "xfs_itable.h" #include "xfs_dfrag.h" #include "xfs_error.h" -#include "xfs_mac.h" #include "xfs_rw.h" /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b95681b03d8..b1af54464f0 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -132,32 +132,6 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) } int -xfs_errortag_clear(int error_tag, xfs_mount_t *mp) -{ - int i; - int64_t fsid; - - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - xfs_etest[i] = 0; - xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); - xfs_etest_fsname[i] = NULL; - cmn_err(CE_WARN, "Cleared XFS error tag #%d", - error_tag); - return 0; - } - } - - cmn_err(CE_WARN, "XFS error tag %d not on", error_tag); - - return 1; -} - -int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) { int i; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0893e16b7d8..5599ada456a 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -144,7 +144,6 @@ extern void xfs_error_test_init(void); #endif /* __ANSI_CPP__ */ extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clear(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp); extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); #else @@ -180,6 +179,6 @@ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? cmn_err(CE_WARN, "XFS: " fmt, ## args) : (void)0) + ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6dba78199fa..3b14427ee12 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -227,7 +227,7 @@ xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efi log items. */ -STATIC struct xfs_item_ops xfs_efi_item_ops = { +static struct xfs_item_ops xfs_efi_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, @@ -525,7 +525,7 @@ xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efd log items. */ -STATIC struct xfs_item_ops xfs_efd_item_ops = { +static struct xfs_item_ops xfs_efd_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c064e72ada9..32c37c1c47a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -250,8 +250,7 @@ xfs_growfs_data_private( block->bb_numrecs = cpu_to_be16(1); block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -272,8 +271,7 @@ xfs_growfs_data_private( block->bb_numrecs = cpu_to_be16(1); block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -460,7 +458,7 @@ xfs_fs_counts( { unsigned long s; - xfs_icsb_sync_counters_lazy(mp); + xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); s = XFS_SB_LOCK(mp); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; @@ -491,7 +489,7 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta; + __int64_t lcounter, delta, fdblks_delta; __uint64_t request; unsigned long s; @@ -504,17 +502,35 @@ xfs_reserve_blocks( } request = *inval; + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the XFS_SB_LOCK so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + * + * We also use the xfs_mod_incore_sb() interface so that we + * don't have to care about whether per cpu counter are + * enabled, disabled or even compiled in.... + */ +retry: s = XFS_SB_LOCK(mp); + xfs_icsb_sync_counters_flags(mp, XFS_ICSB_SB_LOCKED); /* * If our previous reservation was larger than the current value, * then move any unused blocks back to the free pool. */ - + fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ - mp->m_sb.sb_fdblocks += lcounter; + fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; @@ -522,24 +538,50 @@ xfs_reserve_blocks( __int64_t free; free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + delta = request - mp->m_resblks; lcounter = free - delta; if (lcounter < 0) { /* We can't satisfy the request, just get what we can */ mp->m_resblks += free; mp->m_resblks_avail += free; + fdblks_delta = -free; mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); } else { + fdblks_delta = -delta; mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); mp->m_resblks = request; mp->m_resblks_avail += delta; } } - +out: outval->resblks = mp->m_resblks; outval->resblks_avail = mp->m_resblks_avail; XFS_SB_UNLOCK(mp, s); + + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at it's max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); + if (error == ENOSPC) + goto retry; + } + return 0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a446e5a115c..b5feb3e7711 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -342,7 +342,7 @@ xfs_ialloc_ag_alloc( return 0; } -STATIC __inline xfs_agnumber_t +STATIC_INLINE xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 2c0e49893ff..bf8e9aff272 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -89,7 +89,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; /* * Real block structures have a size equal to the disk block size. */ -#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) #define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0]) #define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0]) #define XFS_INOBT_IS_LAST_REC(cur) \ @@ -110,14 +109,13 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; * Record, key, and pointer address macros for btree blocks. */ #define XFS_INOBT_REC_ADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(0, cur))) + (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i)) + #define XFS_INOBT_KEY_ADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) + (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i)) #define XFS_INOBT_PTR_ADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ + (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \ i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 44dfac52128..3da9829c19d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -47,7 +47,6 @@ #include "xfs_utils.h" #include "xfs_dir2_trace.h" #include "xfs_quota.h" -#include "xfs_mac.h" #include "xfs_acl.h" @@ -1699,8 +1698,7 @@ xfs_itruncate_finish( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(tp, &free_list, first_block, - &committed); + error = xfs_bmap_finish(tp, &free_list, &committed); ntp = *tp; if (error) { /* @@ -1810,7 +1808,7 @@ xfs_igrow_start( * and any blocks between the old and new file sizes. */ error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, - ip->i_d.di_size, new_size); + ip->i_d.di_size); return error; } @@ -2125,7 +2123,7 @@ xfs_iunlink_remove( return 0; } -static __inline__ int xfs_inode_clean(xfs_inode_t *ip) +STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip) { return (((ip->i_itemp == NULL) || !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && @@ -2707,10 +2705,24 @@ xfs_idestroy( ktrace_free(ip->i_dir_trace); #endif if (ip->i_itemp) { - /* XXXdpd should be able to assert this but shutdown - * is leaving the AIL behind. */ - ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_mount_t *mp = ip->i_mount; + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + int s; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + AIL_LOCK(mp, s); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_delete_ail(mp, lip, s); + else + AIL_UNLOCK(mp, s); + } xfs_inode_item_destroy(ip); } kmem_zone_free(xfs_inode_zone, ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a7a92251eb5..565d470a6b4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -887,7 +887,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -STATIC struct xfs_item_ops xfs_inode_item_ops = { +static struct xfs_item_ops xfs_inode_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 19655124da7..cc6a7b5a991 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -43,8 +43,6 @@ #include "xfs_itable.h" #include "xfs_rw.h" #include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -542,7 +540,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto error0; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); @@ -838,8 +836,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, - first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto trans_cancel; @@ -947,8 +944,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_bmap_finish(&(tp), &(free_list), - firstfsb, &committed); + error = xfs_bmap_finish(&(tp), &(free_list), &committed); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3cb678e3a13..ca74d3f5910 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1514,7 +1514,6 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *first_item, *itemq, *itemq_next; xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; ushort flags = 0; first_item = itemq = trans->r_itemq; @@ -1522,29 +1521,16 @@ xlog_recover_reorder_trans( do { itemq_next = itemq->ri_next; buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { - case XFS_LI_BUF: - flags = buf_f->blf_flags; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - flags = obuf_f->blf_flags; - break; - } switch (ITEM_TYPE(itemq)) { case XFS_LI_BUF: - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: + flags = buf_f->blf_flags; if (!(flags & XFS_BLI_CANCEL)) { xlog_recover_insert_item_frontq(&trans->r_itemq, itemq); break; } case XFS_LI_INODE: - case XFS_LI_6_1_INODE: - case XFS_LI_5_3_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: case XFS_LI_EFD: @@ -1583,7 +1569,6 @@ xlog_recover_do_buffer_pass1( xfs_buf_cancel_t *nextp; xfs_buf_cancel_t *prevp; xfs_buf_cancel_t **bucket; - xfs_buf_log_format_v1_t *obuf_f; xfs_daddr_t blkno = 0; uint len = 0; ushort flags = 0; @@ -1594,13 +1579,6 @@ xlog_recover_do_buffer_pass1( len = buf_f->blf_len; flags = buf_f->blf_flags; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; } /* @@ -1746,7 +1724,6 @@ xlog_recover_do_buffer_pass2( xlog_t *log, xfs_buf_log_format_t *buf_f) { - xfs_buf_log_format_v1_t *obuf_f; xfs_daddr_t blkno = 0; ushort flags = 0; uint len = 0; @@ -1757,13 +1734,6 @@ xlog_recover_do_buffer_pass2( flags = buf_f->blf_flags; len = buf_f->blf_len; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - flags = obuf_f->blf_flags; - len = (xfs_daddr_t) obuf_f->blf_len; - break; } return xlog_check_buffer_cancelled(log, blkno, len, flags); @@ -1799,7 +1769,6 @@ xlog_recover_do_inode_buffer( int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - xfs_buf_log_format_v1_t *obuf_f; unsigned int *data_map = NULL; unsigned int map_size = 0; @@ -1808,12 +1777,6 @@ xlog_recover_do_inode_buffer( data_map = buf_f->blf_data_map; map_size = buf_f->blf_map_size; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; } /* * Set the variables corresponding to the current region to @@ -1912,7 +1875,6 @@ xlog_recover_do_reg_buffer( int i; int bit; int nbits; - xfs_buf_log_format_v1_t *obuf_f; unsigned int *data_map = NULL; unsigned int map_size = 0; int error; @@ -1922,12 +1884,6 @@ xlog_recover_do_reg_buffer( data_map = buf_f->blf_data_map; map_size = buf_f->blf_map_size; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; } bit = 0; i = 1; /* 0 is the buf format structure */ @@ -2160,7 +2116,6 @@ xlog_recover_do_buffer_trans( int pass) { xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; xfs_mount_t *mp; xfs_buf_t *bp; int error; @@ -2197,13 +2152,6 @@ xlog_recover_do_buffer_trans( len = buf_f->blf_len; flags = buf_f->blf_flags; break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; default: xfs_fs_cmn_err(CE_ALERT, log->l_mp, "xfs_log_recover: unknown buffer type 0x%x, logdev %s", @@ -2830,9 +2778,7 @@ xlog_recover_do_trans( * where xfs_daddr_t is 32-bits but mount will warn us * off a > 1 TB filesystem before we get here. */ - if ((ITEM_TYPE(item) == XFS_LI_BUF) || - (ITEM_TYPE(item) == XFS_LI_6_1_BUF) || - (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) { + if ((ITEM_TYPE(item) == XFS_LI_BUF)) { if ((error = xlog_recover_do_buffer_trans(log, item, pass))) break; @@ -3902,6 +3848,9 @@ xlog_do_recover( ASSERT(XFS_SB_GOOD_VERSION(sbp)); xfs_buf_relse(bp); + /* We've re-read the superblock so re-initialize per-cpu counters */ + xfs_icsb_reinit_counters(log->l_mp); + xlog_recover_check_summary(log); /* Normal transactions can now occur */ diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h deleted file mode 100644 index 18e0e98e03d..00000000000 --- a/fs/xfs/xfs_mac.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_MAC_H__ -#define __XFS_MAC_H__ - -/* - * Mandatory Access Control - * - * Layout of a composite MAC label: - * ml_list contains the list of categories (MSEN) followed by the list of - * divisions (MINT). This is actually a header for the data structure which - * will have an ml_list with more than one element. - * - * ------------------------------- - * | ml_msen_type | ml_mint_type | - * ------------------------------- - * | ml_level | ml_grade | - * ------------------------------- - * | ml_catcount | - * ------------------------------- - * | ml_divcount | - * ------------------------------- - * | category 1 | - * | . . . | - * | category N | (where N = ml_catcount) - * ------------------------------- - * | division 1 | - * | . . . | - * | division M | (where M = ml_divcount) - * ------------------------------- - */ -#define XFS_MAC_MAX_SETS 250 -typedef struct xfs_mac_label { - __uint8_t ml_msen_type; /* MSEN label type */ - __uint8_t ml_mint_type; /* MINT label type */ - __uint8_t ml_level; /* Hierarchical level */ - __uint8_t ml_grade; /* Hierarchical grade */ - __uint16_t ml_catcount; /* Category count */ - __uint16_t ml_divcount; /* Division count */ - /* Category set, then Division set */ - __uint16_t ml_list[XFS_MAC_MAX_SETS]; -} xfs_mac_label_t; - -/* MSEN label type names. Choose an upper case ASCII character. */ -#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: low<admin != tcsec<high */ -#define XFS_MSEN_EQUAL_LABEL 'E' /* Wildcard - always equal */ -#define XFS_MSEN_HIGH_LABEL 'H' /* System High - always dominates */ -#define XFS_MSEN_MLD_HIGH_LABEL 'I' /* System High, multi-level dir */ -#define XFS_MSEN_LOW_LABEL 'L' /* System Low - always dominated */ -#define XFS_MSEN_MLD_LABEL 'M' /* TCSEC label on a multi-level dir */ -#define XFS_MSEN_MLD_LOW_LABEL 'N' /* System Low, multi-level dir */ -#define XFS_MSEN_TCSEC_LABEL 'T' /* TCSEC label */ -#define XFS_MSEN_UNKNOWN_LABEL 'U' /* unknown label */ - -/* MINT label type names. Choose a lower case ASCII character. */ -#define XFS_MINT_BIBA_LABEL 'b' /* Dual of a TCSEC label */ -#define XFS_MINT_EQUAL_LABEL 'e' /* Wildcard - always equal */ -#define XFS_MINT_HIGH_LABEL 'h' /* High Grade - always dominates */ -#define XFS_MINT_LOW_LABEL 'l' /* Low Grade - always dominated */ - -/* On-disk XFS extended attribute names */ -#define SGI_MAC_FILE "SGI_MAC_FILE" -#define SGI_MAC_FILE_SIZE (sizeof(SGI_MAC_FILE)-1) - - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_MAC - -/* NOT YET IMPLEMENTED */ - -#define MACEXEC 00100 -#define MACWRITE 00200 -#define MACREAD 00400 - -struct xfs_inode; -extern int xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *); - -#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c)) -#define _MAC_VACCESS(v,c,m) (xfs_mac_vaccess(v,c,m)) -#define _MAC_EXISTS xfs_mac_vhaslabel - -#else -#define _MAC_XFS_IACCESS(i,m,c) (0) -#define _MAC_VACCESS(v,c,m) (0) -#define _MAC_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_MAC_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9dfae18d995..3bed0cf0d8a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -52,21 +52,19 @@ STATIC void xfs_unmountfs_wait(xfs_mount_t *); #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); +STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, + int, int); STATIC void xfs_icsb_sync_counters(xfs_mount_t *); STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC int xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t, - int, int); + int64_t, int); STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else #define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) +#define xfs_icsb_balance_counter(mp, a, b, c) do { } while (0) #define xfs_icsb_sync_counters(mp) do { } while (0) #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) -#define xfs_icsb_modify_counters_locked(mp, a, b, c) do { } while (0) #endif @@ -545,9 +543,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); } - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + /* Initialize per-cpu counters */ + xfs_icsb_reinit_counters(mp); mp->m_sb_bp = bp; xfs_buf_relse(bp); @@ -1254,8 +1251,11 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) * The SB_LOCK must be held when this routine is called. */ int -xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, - int delta, int rsvd) +xfs_mod_incore_sb_unlocked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { int scounter; /* short counter for 32 bit fields */ long long lcounter; /* long counter for 64 bit fields */ @@ -1287,7 +1287,6 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, mp->m_sb.sb_ifree = lcounter; return 0; case XFS_SBS_FDBLOCKS: - lcounter = (long long) mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); @@ -1418,7 +1417,11 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, * routine to do the work. */ int -xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) +xfs_mod_incore_sb( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { unsigned long s; int status; @@ -1485,9 +1488,11 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = xfs_icsb_modify_counters_locked(mp, + XFS_SB_UNLOCK(mp, s); + status = xfs_icsb_modify_counters(mp, msbp->msb_field, msbp->msb_delta, rsvd); + s = XFS_SB_LOCK(mp); break; } /* FALLTHROUGH */ @@ -1521,11 +1526,12 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = - xfs_icsb_modify_counters_locked(mp, + XFS_SB_UNLOCK(mp, s); + status = xfs_icsb_modify_counters(mp, msbp->msb_field, -(msbp->msb_delta), rsvd); + s = XFS_SB_LOCK(mp); break; } /* FALLTHROUGH */ @@ -1733,14 +1739,17 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); break; case CPU_ONLINE: - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_lock(mp); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_unlock(mp); break; case CPU_DEAD: /* Disable all the counters, then fold the dead cpu's * count into the total on the global superblock and * re-enable the counters. */ + xfs_icsb_lock(mp); s = XFS_SB_LOCK(mp); xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); @@ -1752,10 +1761,14 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, + XFS_ICSB_SB_LOCKED, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, + XFS_ICSB_SB_LOCKED, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, + XFS_ICSB_SB_LOCKED, 0); XFS_SB_UNLOCK(mp, s); + xfs_icsb_unlock(mp); break; } @@ -1784,6 +1797,9 @@ xfs_icsb_init_counters( cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); } + + mutex_init(&mp->m_icsb_mutex); + /* * start with all counters disabled so that the * initial balance kicks us off correctly @@ -1792,6 +1808,22 @@ xfs_icsb_init_counters( return 0; } +void +xfs_icsb_reinit_counters( + xfs_mount_t *mp) +{ + xfs_icsb_lock(mp); + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_unlock(mp); +} + STATIC void xfs_icsb_destroy_counters( xfs_mount_t *mp) @@ -1800,9 +1832,10 @@ xfs_icsb_destroy_counters( unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } + mutex_destroy(&mp->m_icsb_mutex); } -STATIC inline void +STATIC_INLINE void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1811,7 +1844,7 @@ xfs_icsb_lock_cntr( } } -STATIC inline void +STATIC_INLINE void xfs_icsb_unlock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1819,7 +1852,7 @@ xfs_icsb_unlock_cntr( } -STATIC inline void +STATIC_INLINE void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { @@ -1832,7 +1865,7 @@ xfs_icsb_lock_all_counters( } } -STATIC inline void +STATIC_INLINE void xfs_icsb_unlock_all_counters( xfs_mount_t *mp) { @@ -1888,6 +1921,17 @@ xfs_icsb_disable_counter( ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + /* + * If we are already disabled, then there is nothing to do + * here. We check before locking all the counters to avoid + * the expensive lock operation when being called in the + * slow path and the counter is already disabled. This is + * safe because the only time we set or clear this state is under + * the m_icsb_mutex. + */ + if (xfs_icsb_counter_disabled(mp, field)) + return 0; + xfs_icsb_lock_all_counters(mp); if (!test_and_set_bit(field, &mp->m_icsb_counters)) { /* drain back to superblock */ @@ -1948,8 +1992,8 @@ xfs_icsb_enable_counter( xfs_icsb_unlock_all_counters(mp); } -STATIC void -xfs_icsb_sync_counters_int( +void +xfs_icsb_sync_counters_flags( xfs_mount_t *mp, int flags) { @@ -1981,40 +2025,39 @@ STATIC void xfs_icsb_sync_counters( xfs_mount_t *mp) { - xfs_icsb_sync_counters_int(mp, 0); -} - -/* - * lazy addition used for things like df, background sb syncs, etc - */ -void -xfs_icsb_sync_counters_lazy( - xfs_mount_t *mp) -{ - xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters_flags(mp, 0); } /* * Balance and enable/disable counters as necessary. * - * Thresholds for re-enabling counters are somewhat magic. - * inode counts are chosen to be the same number as single - * on disk allocation chunk per CPU, and free blocks is - * something far enough zero that we aren't going thrash - * when we get near ENOSPC. + * Thresholds for re-enabling counters are somewhat magic. inode counts are + * chosen to be the same number as single on disk allocation chunk per CPU, and + * free blocks is something far enough zero that we aren't going thrash when we + * get near ENOSPC. We also need to supply a minimum we require per cpu to + * prevent looping endlessly when xfs_alloc_space asks for more than will + * be distributed to a single CPU but each CPU has enough blocks to be + * reenabled. + * + * Note that we can be called when counters are already disabled. + * xfs_icsb_disable_counter() optimises the counter locking in this case to + * prevent locking every per-cpu counter needlessly. */ -#define XFS_ICSB_INO_CNTR_REENABLE 64 + +#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (512 + XFS_ALLOC_SET_ASIDE(mp)) + (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void xfs_icsb_balance_counter( xfs_mount_t *mp, xfs_sb_field_t field, - int flags) + int flags, + int min_per_cpu) { uint64_t count, resid; int weight = num_online_cpus(); int s; + uint64_t min = (uint64_t)min_per_cpu; if (!(flags & XFS_ICSB_SB_LOCKED)) s = XFS_SB_LOCK(mp); @@ -2027,19 +2070,19 @@ xfs_icsb_balance_counter( case XFS_SBS_ICOUNT: count = mp->m_sb.sb_icount; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) goto out; break; case XFS_SBS_IFREE: count = mp->m_sb.sb_ifree; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) goto out; break; case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); - if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp)) + if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) goto out; break; default: @@ -2054,32 +2097,39 @@ out: XFS_SB_UNLOCK(mp, s); } -STATIC int -xfs_icsb_modify_counters_int( +int +xfs_icsb_modify_counters( xfs_mount_t *mp, xfs_sb_field_t field, - int delta, - int rsvd, - int flags) + int64_t delta, + int rsvd) { xfs_icsb_cnts_t *icsbp; long long lcounter; /* long counter for 64 bit fields */ - int cpu, s, locked = 0; - int ret = 0, balance_done = 0; + int cpu, ret = 0, s; + might_sleep(); again: cpu = get_cpu(); - icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu), - xfs_icsb_lock_cntr(icsbp); + icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu); + + /* + * if the counter is disabled, go to slow path + */ if (unlikely(xfs_icsb_counter_disabled(mp, field))) goto slow_path; + xfs_icsb_lock_cntr(icsbp); + if (unlikely(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock_cntr(icsbp); + goto slow_path; + } switch (field) { case XFS_SBS_ICOUNT: lcounter = icsbp->icsb_icount; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_icount = lcounter; break; @@ -2087,7 +2137,7 @@ again: lcounter = icsbp->icsb_ifree; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_ifree = lcounter; break; @@ -2097,7 +2147,7 @@ again: lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); break; default: @@ -2106,72 +2156,78 @@ again: } xfs_icsb_unlock_cntr(icsbp); put_cpu(); - if (locked) - XFS_SB_UNLOCK(mp, s); return 0; - /* - * The slow path needs to be run with the SBLOCK - * held so that we prevent other threads from - * attempting to run this path at the same time. - * this provides exclusion for the balancing code, - * and exclusive fallback if the balance does not - * provide enough resources to continue in an unlocked - * manner. - */ slow_path: - xfs_icsb_unlock_cntr(icsbp); put_cpu(); - /* need to hold superblock incase we need - * to disable a counter */ - if (!(flags & XFS_ICSB_SB_LOCKED)) { - s = XFS_SB_LOCK(mp); - locked = 1; - flags |= XFS_ICSB_SB_LOCKED; - } - if (!balance_done) { - xfs_icsb_balance_counter(mp, field, flags); - balance_done = 1; + /* + * serialise with a mutex so we don't burn lots of cpu on + * the superblock lock. We still need to hold the superblock + * lock, however, when we modify the global structures. + */ + xfs_icsb_lock(mp); + + /* + * Now running atomically. + * + * If the counter is enabled, someone has beaten us to rebalancing. + * Drop the lock and try again in the fast path.... + */ + if (!(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock(mp); goto again; - } else { - /* - * we might not have enough on this local - * cpu to allocate for a bulk request. - * We need to drain this field from all CPUs - * and disable the counter fastpath - */ - xfs_icsb_disable_counter(mp, field); } + /* + * The counter is currently disabled. Because we are + * running atomically here, we know a rebalance cannot + * be in progress. Hence we can go straight to operating + * on the global superblock. We do not call xfs_mod_incore_sb() + * here even though we need to get the SB_LOCK. Doing so + * will cause us to re-enter this function and deadlock. + * Hence we get the SB_LOCK ourselves and then call + * xfs_mod_incore_sb_unlocked() as the unlocked path operates + * directly on the global counters. + */ + s = XFS_SB_LOCK(mp); ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + XFS_SB_UNLOCK(mp, s); - if (locked) - XFS_SB_UNLOCK(mp, s); + /* + * Now that we've modified the global superblock, we + * may be able to re-enable the distributed counters + * (e.g. lots of space just got freed). After that + * we are done. + */ + if (ret != ENOSPC) + xfs_icsb_balance_counter(mp, field, 0, 0); + xfs_icsb_unlock(mp); return ret; -} -STATIC int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0); -} +balance_counter: + xfs_icsb_unlock_cntr(icsbp); + put_cpu(); -/* - * Called when superblock is already locked - */ -STATIC int -xfs_icsb_modify_counters_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, - rsvd, XFS_ICSB_SB_LOCKED); + /* + * We may have multiple threads here if multiple per-cpu + * counters run dry at the same time. This will mean we can + * do more balances than strictly necessary but it is not + * the common slowpath case. + */ + xfs_icsb_lock(mp); + + /* + * running atomically. + * + * This will leave the counter in the correct state for future + * accesses. After the rebalance, we simply try again and our retry + * will either succeed through the fast path or slow path without + * another balance operation being required. + */ + xfs_icsb_balance_counter(mp, field, 0, delta); + xfs_icsb_unlock(mp); + goto again; } + #endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e5f396ff9a3..82304b94646 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,6 +18,7 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ + typedef struct xfs_trans_reservations { uint tr_write; /* extent alloc trans */ uint tr_itruncate; /* truncate trans */ @@ -306,11 +307,13 @@ typedef struct xfs_icsb_cnts { #define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters_lazy(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int); #else #define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_sync_counters_lazy(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_sync_counters_flags(mp, flags) do { } while (0) #endif typedef struct xfs_mount { @@ -419,6 +422,7 @@ typedef struct xfs_mount { xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ unsigned long m_icsb_counters; /* disabled per-cpu counters */ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ + struct mutex m_icsb_mutex; /* balancer sync lock */ #endif } xfs_mount_t; @@ -563,11 +567,32 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* + * Per-cpu superblock locking functions + */ +#ifdef HAVE_PERCPU_SB +STATIC_INLINE void +xfs_icsb_lock(xfs_mount_t *mp) +{ + mutex_lock(&mp->m_icsb_mutex); +} + +STATIC_INLINE void +xfs_icsb_unlock(xfs_mount_t *mp) +{ + mutex_unlock(&mp->m_icsb_mutex); +} +#else +#define xfs_icsb_lock(mp) +#define xfs_icsb_unlock(mp) +#endif + +/* * This structure is for use by the xfs_mod_incore_sb_batch() routine. + * xfs_growfs can specify a few fields which are more than int limit */ typedef struct xfs_mod_sb { xfs_sb_field_t msb_field; /* Field to modify, see below */ - int msb_delta; /* Change to make to specified field */ + int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) @@ -585,17 +610,17 @@ extern int xfs_unmountfs(xfs_mount_t *, struct cred *); extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_unmount_flush(xfs_mount_t *, int); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int, int); + int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); -extern int xfs_syncsub(xfs_mount_t *, int, int, int *); -extern int xfs_sync_inodes(xfs_mount_t *, int, int, int *); +extern int xfs_syncsub(xfs_mount_t *, int, int *); +extern int xfs_sync_inodes(xfs_mount_t *, int, int *); extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *, xfs_agnumber_t); extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d98171deaa1..4c6573d784c 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -565,7 +565,7 @@ xfs_rename( IHOLD(target_ip); IHOLD(src_ip); - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 880c73271c0..6fff19dc3cf 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -147,7 +147,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, firstblock, &committed); + error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto error_exit; xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); @@ -913,57 +913,6 @@ xfs_rtcheck_alloc_range( } #endif -#ifdef DEBUG -/* - * Check whether the given block in the bitmap has the given value. - */ -STATIC int /* 1 for matches, 0 for not */ -xfs_rtcheck_bit( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* bit (block) to check */ - int val) /* 1 for free, 0 for allocated */ -{ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* pointer into the buffer */ - /* REFERENCED */ - int error; /* error value */ - xfs_rtword_t wdiff; /* difference between bit & expected */ - int word; /* word number in the buffer */ - xfs_rtword_t wval; /* word value from buffer */ - - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = XFS_BITTOWORD(mp, start); - bit = (int)(start & (XFS_NBWORD - 1)); - wval = bufp[word]; - xfs_trans_brelse(tp, bp); - wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit); - return !wdiff; -} -#endif /* DEBUG */ - -#if 0 -/* - * Check that the given extent (block range) is free already. - */ -STATIC int /* error */ -xfs_rtcheck_free_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for free, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat); -} -#endif - /* * Check that the given range is either all allocated (val = 0) or * all free (val = 1). @@ -2382,60 +2331,3 @@ xfs_rtpick_extent( *pick = b; return 0; } - -#ifdef DEBUG -/* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len) /* length to print */ -{ - xfs_extlen_t i; /* block number in the extent */ - - cmn_err(CE_DEBUG, "%Ld: ", (long long)start); - for (i = 0; i < len; i++) - cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1)); - cmn_err(CE_DEBUG, "\n"); -} - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_suminfo_t c; /* summary data */ - xfs_rtblock_t i; /* bitmap block number */ - int l; /* summary information level */ - int p; /* flag for printed anything */ - xfs_fsblock_t sb; /* summary block number */ - xfs_buf_t *sumbp; /* summary block buffer */ - - sumbp = NULL; - for (l = 0; l < mp->m_rsumlevels; l++) { - for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c); - if (c) { - if (!p) { - cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l, - XFS_RTMIN((1LL << l) + - ((1LL << l) - 1LL), - mp->m_sb.sb_rextents)); - p = 1; - } - cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c); - } - } - if (p) - cmn_err(CE_DEBUG, "\n"); - } - if (sumbp) - xfs_trans_brelse(tp, sumbp); -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0e0b4d2ec20..799c1f87126 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -134,24 +134,6 @@ xfs_rtpick_extent( xfs_rtblock_t *pick); /* result rt extent */ /* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len); /* length to print */ - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp); /* transaction pointer */ - -/* * Grow the realtime area of the filesystem. */ int diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index defb2febaaf..1ea7c0ca6ae 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -42,7 +42,6 @@ #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_acl.h" -#include "xfs_mac.h" #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_rw.h" diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ee2721e0de4..301ff9445b6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -339,7 +339,7 @@ xfs_trans_reserve( */ if (blocks > 0) { error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - -blocks, rsvd); + -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return (XFS_ERROR(ENOSPC)); @@ -380,7 +380,7 @@ xfs_trans_reserve( */ if (rtextents > 0) { error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -rtextents, rsvd); + -((int64_t)rtextents), rsvd); if (error) { error = XFS_ERROR(ENOSPC); goto undo_log; @@ -410,7 +410,7 @@ undo_log: undo_blocks: if (blocks > 0) { (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - blocks, rsvd); + (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -432,7 +432,7 @@ void xfs_trans_mod_sb( xfs_trans_t *tp, uint field, - long delta) + int64_t delta) { switch (field) { @@ -663,62 +663,62 @@ xfs_trans_unreserve_and_mod_sb( if (tp->t_flags & XFS_TRANS_SB_DIRTY) { if (tp->t_icount_delta != 0) { msbp->msb_field = XFS_SBS_ICOUNT; - msbp->msb_delta = (int)tp->t_icount_delta; + msbp->msb_delta = tp->t_icount_delta; msbp++; } if (tp->t_ifree_delta != 0) { msbp->msb_field = XFS_SBS_IFREE; - msbp->msb_delta = (int)tp->t_ifree_delta; + msbp->msb_delta = tp->t_ifree_delta; msbp++; } if (tp->t_fdblocks_delta != 0) { msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = (int)tp->t_fdblocks_delta; + msbp->msb_delta = tp->t_fdblocks_delta; msbp++; } if (tp->t_frextents_delta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = (int)tp->t_frextents_delta; + msbp->msb_delta = tp->t_frextents_delta; msbp++; } if (tp->t_dblocks_delta != 0) { msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = (int)tp->t_dblocks_delta; + msbp->msb_delta = tp->t_dblocks_delta; msbp++; } if (tp->t_agcount_delta != 0) { msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = (int)tp->t_agcount_delta; + msbp->msb_delta = tp->t_agcount_delta; msbp++; } if (tp->t_imaxpct_delta != 0) { msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = (int)tp->t_imaxpct_delta; + msbp->msb_delta = tp->t_imaxpct_delta; msbp++; } if (tp->t_rextsize_delta != 0) { msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = (int)tp->t_rextsize_delta; + msbp->msb_delta = tp->t_rextsize_delta; msbp++; } if (tp->t_rbmblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = (int)tp->t_rbmblocks_delta; + msbp->msb_delta = tp->t_rbmblocks_delta; msbp++; } if (tp->t_rblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = (int)tp->t_rblocks_delta; + msbp->msb_delta = tp->t_rblocks_delta; msbp++; } if (tp->t_rextents_delta != 0) { msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = (int)tp->t_rextents_delta; + msbp->msb_delta = tp->t_rextents_delta; msbp++; } if (tp->t_rextslog_delta != 0) { msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = (int)tp->t_rextslog_delta; + msbp->msb_delta = tp->t_rextslog_delta; msbp++; } } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c68e00105d2..f1d7ab23672 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -39,13 +39,9 @@ typedef struct xfs_trans_header { /* * Log item types. */ -#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */ -#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */ #define XFS_LI_EFI 0x1236 #define XFS_LI_EFD 0x1237 #define XFS_LI_IUNLINK 0x1238 -#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */ -#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */ #define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ #define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ #define XFS_LI_DQUOT 0x123d @@ -354,25 +350,25 @@ typedef struct xfs_trans { xfs_trans_callback_t t_callback; /* transaction callback */ void *t_callarg; /* callback arg */ unsigned int t_flags; /* misc flags */ - long t_icount_delta; /* superblock icount change */ - long t_ifree_delta; /* superblock ifree change */ - long t_fdblocks_delta; /* superblock fdblocks chg */ - long t_res_fdblocks_delta; /* on-disk only chg */ - long t_frextents_delta;/* superblock freextents chg*/ - long t_res_frextents_delta; /* on-disk only chg */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ #ifdef DEBUG - long t_ag_freeblks_delta; /* debugging counter */ - long t_ag_flist_delta; /* debugging counter */ - long t_ag_btree_delta; /* debugging counter */ + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ #endif - long t_dblocks_delta;/* superblock dblocks change */ - long t_agcount_delta;/* superblock agcount change */ - long t_imaxpct_delta;/* superblock imaxpct change */ - long t_rextsize_delta;/* superblock rextsize chg */ - long t_rbmblocks_delta;/* superblock rbmblocks chg */ - long t_rblocks_delta;/* superblock rblocks change */ - long t_rextents_delta;/* superblocks rextents chg */ - long t_rextslog_delta;/* superblocks rextslog chg */ + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ @@ -936,9 +932,9 @@ typedef struct xfs_trans { #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) #ifdef DEBUG -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d) +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) #else #define xfs_trans_agblocks_delta(tp, d) #define xfs_trans_agflist_delta(tp, d) @@ -954,7 +950,7 @@ xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); -void xfs_trans_mod_sb(xfs_trans_t *, uint, long); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, int, uint); int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index fc39b166d40..ceb4f6e9996 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -90,7 +90,7 @@ xfs_trans_push_ail( int flush_log; SPLDECL(s); -#define XFS_TRANS_PUSH_AIL_RESTARTS 10 +#define XFS_TRANS_PUSH_AIL_RESTARTS 1000 AIL_LOCK(mp,s); lip = xfs_trans_first_ail(mp, &gen); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 62336a4cc5a..29f72f61378 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -640,7 +640,7 @@ xfs_quiesce_fs( * we can write the unmount record. */ do { - xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, 0, NULL); + xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, NULL); pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); if (!pincount) { delay(50); @@ -806,7 +806,7 @@ xfs_statvfs( statp->f_type = XFS_SB_MAGIC; - xfs_icsb_sync_counters_lazy(mp); + xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); s = XFS_SB_LOCK(mp); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; @@ -872,6 +872,10 @@ xfs_statvfs( * this by simply making sure the log gets flushed * if SYNC_BDFLUSH is set, and by actually writing it * out otherwise. + * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete + * before we return (including direct I/O). Forms the drain + * side of the write barrier needed to safely quiesce the + * filesystem. * */ /*ARGSUSED*/ @@ -883,27 +887,20 @@ xfs_sync( { xfs_mount_t *mp = XFS_BHVTOM(bdp); - if (unlikely(flags == SYNC_QUIESCE)) - return xfs_quiesce_fs(mp); - else - return xfs_syncsub(mp, flags, 0, NULL); + return xfs_syncsub(mp, flags, NULL); } /* * xfs sync routine for internal use * * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. + * interface as explained above under xfs_sync. * */ int xfs_sync_inodes( xfs_mount_t *mp, int flags, - int xflags, int *bypassed) { xfs_inode_t *ip = NULL; @@ -1176,6 +1173,13 @@ xfs_sync_inodes( } } + /* + * When freezing, we need to wait ensure all I/O (including direct + * I/O) is complete to ensure no further data modification can take + * place after this point + */ + if (flags & SYNC_IOWAIT) + vn_iowait(vp); if (flags & SYNC_BDFLUSH) { if ((flags & SYNC_ATTR) && @@ -1412,17 +1416,13 @@ xfs_sync_inodes( * xfs sync routine for internal use * * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. + * interface as explained above under xfs_sync. * */ int xfs_syncsub( xfs_mount_t *mp, int flags, - int xflags, int *bypassed) { int error = 0; @@ -1444,7 +1444,7 @@ xfs_syncsub( if (flags & SYNC_BDFLUSH) xfs_finish_reclaim_all(mp, 1); else - error = xfs_sync_inodes(mp, flags, xflags, bypassed); + error = xfs_sync_inodes(mp, flags, bypassed); } /* @@ -1958,15 +1958,26 @@ xfs_showargs( return 0; } +/* + * Second stage of a freeze. The data is already frozen, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceding. + */ STATIC void xfs_freeze( bhv_desc_t *bdp) { xfs_mount_t *mp = XFS_BHVTOM(bdp); + /* wait for all modifications to complete */ while (atomic_read(&mp->m_active_trans) > 0) delay(100); + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + /* Push the superblock and write an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index bda774a04b8..52c41714ec5 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -51,7 +51,6 @@ #include "xfs_refcache.h" #include "xfs_trans_space.h" #include "xfs_log_priv.h" -#include "xfs_mac.h" STATIC int xfs_open( @@ -1381,7 +1380,7 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed))) + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) goto error1; /* * The transaction must have been committed, since there were @@ -1790,8 +1789,7 @@ xfs_inactive( * Just ignore errors at this point. There is * nothing we can do except to try to keep going. */ - (void) xfs_bmap_finish(&tp, &free_list, first_block, - &committed); + (void) xfs_bmap_finish(&tp, &free_list, &committed); (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); } /* @@ -2022,7 +2020,7 @@ xfs_create( IHOLD(ip); vp = XFS_ITOV(ip); - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); goto abort_rele; @@ -2507,7 +2505,7 @@ xfs_remove( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { REMOVE_DEBUG_TRACE(__LINE__); goto error_rele; @@ -2715,7 +2713,7 @@ xfs_link( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); goto abort_return; @@ -2932,7 +2930,7 @@ xfs_mkdir( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { IRELE(cdp); goto error2; @@ -3183,7 +3181,7 @@ xfs_rmdir( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | @@ -3533,7 +3531,7 @@ xfs_symlink( */ IHOLD(ip); - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error2; } @@ -4145,7 +4143,7 @@ retry: /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error0; } @@ -4452,7 +4450,7 @@ xfs_free_file_space( /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { goto error0; } |