summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c3
-rw-r--r--fs/afs/cell.c56
-rw-r--r--fs/afs/dir.c47
-rw-r--r--fs/afs/inode.c86
-rw-r--r--fs/afs/internal.h23
-rw-r--r--fs/afs/mntpt.c78
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/super.c20
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/buffer.c69
-rw-r--r--fs/cachefiles/internal.h13
-rw-r--r--fs/ceph/addr.c12
-rw-r--r--fs/ceph/auth_x.c15
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/debugfs.c4
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c5
-rw-r--r--fs/ceph/locks.c14
-rw-r--r--fs/ceph/mds_client.c101
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/osd_client.c2
-rw-r--r--fs/ceph/snap.c89
-rw-r--r--fs/ceph/super.h11
-rw-r--r--fs/ceph/xattr.c1
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/README10
-rw-r--r--fs/cifs/asn1.c6
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsencrypt.c475
-rw-r--r--fs/cifs/cifsglob.h25
-rw-r--r--fs/cifs/cifspdu.h7
-rw-r--r--fs/cifs/cifsproto.h12
-rw-r--r--fs/cifs/cifssmb.c13
-rw-r--r--fs/cifs/connect.c17
-rw-r--r--fs/cifs/dir.c157
-rw-r--r--fs/cifs/file.c3
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/ntlmssp.h13
-rw-r--r--fs/cifs/sess.c132
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/compat.c23
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c73
-rw-r--r--fs/ecryptfs/crypto.c3
-rw-r--r--fs/ecryptfs/inode.c31
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/messaging.c2
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/exec.c25
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/file_table.c133
-rw-r--r--fs/fs-writeback.c81
-rw-r--r--fs/fs_struct.c32
-rw-r--r--fs/fscache/internal.h14
-rw-r--r--fs/fuse/dev.c42
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/generic_acl.c1
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/internal.h7
-rw-r--r--fs/ioctl.c18
-rw-r--r--fs/jbd/checkpoint.c4
-rw-r--r--fs/jbd/commit.c49
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--fs/jbd2/commit.c39
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c46
-rw-r--r--fs/logfs/dir.c2
-rw-r--r--fs/logfs/file.c6
-rw-r--r--fs/logfs/logfs.h3
-rw-r--r--fs/mbcache.c30
-rw-r--r--fs/namei.c119
-rw-r--r--fs/namespace.c200
-rw-r--r--fs/nfs/Kconfig18
-rw-r--r--fs/nfs/callback.c11
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/dns_resolve.c24
-rw-r--r--fs/nfs/dns_resolve.h12
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/nfs4proc.c11
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs4state.c28
-rw-r--r--fs/nfsd/state.h14
-rw-r--r--fs/nfsd/vfs.c14
-rw-r--r--fs/nilfs2/super.c32
-rw-r--r--fs/nilfs2/the_nilfs.c9
-rw-r--r--fs/notify/fanotify/fanotify.c11
-rw-r--r--fs/notify/fanotify/fanotify_user.c35
-rw-r--r--fs/notify/fsnotify.c80
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c12
-rw-r--r--fs/notify/notification.c33
-rw-r--r--fs/ocfs2/acl.c33
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c17
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c9
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/file.c36
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/inode.h11
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c8
-rw-r--r--fs/ocfs2/namei.c302
-rw-r--r--fs/ocfs2/ocfs2.h23
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h95
-rw-r--r--fs/ocfs2/refcounttree.c25
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/suballoc.c219
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/open.c4
-rw-r--r--fs/partitions/ibm.c2
-rw-r--r--fs/pnode.c11
-rw-r--r--fs/proc/inode.c17
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/stat.c29
-rw-r--r--fs/super.c18
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/ialloc.c18
-rw-r--r--fs/ufs/truncate.c18
-rw-r--r--fs/ufs/util.c20
-rw-r--r--fs/ufs/util.h3
-rw-r--r--fs/utimes.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c42
-rw-r--r--fs/xfs/xfs_bmap.c14
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c31
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c16
-rw-r--r--fs/xfs/xfs_inode.c49
-rw-r--r--fs/xfs/xfs_log.c7
-rw-r--r--fs/xfs/xfs_log_cil.c263
-rw-r--r--fs/xfs/xfs_log_priv.h13
-rw-r--r--fs/xfs/xfs_trans.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_vnodeops.c13
156 files changed, 3393 insertions, 1614 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 35856368906..6406f896bf9 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
}
kfree(wnames);
fid_out:
- v9fs_fid_add(dentry, fid);
+ if (!IS_ERR(fid))
+ v9fs_fid_add(dentry, fid);
err_out:
up_read(&v9ses->rename_sem);
return fid;
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index ffea35c6387..0d5eeadf612 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -31,21 +31,20 @@ static struct afs_cell *afs_cell_root;
* allocate a cell record and fill in its name, VL server address list and
* allocate an anonymous key
*/
-static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
+static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
+ char *vllist)
{
struct afs_cell *cell;
struct key *key;
- size_t namelen;
char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
char *dvllist = NULL, *_vllist = NULL;
char delimiter = ':';
int ret;
- _enter("%s,%s", name, vllist);
+ _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
- namelen = strlen(name);
if (namelen > AFS_MAXCELLNAME) {
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
@@ -73,6 +72,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
if (!vllist || strlen(vllist) < 7) {
ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
if (ret < 0) {
+ if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
+ /* translate these errors into something
+ * userspace might understand */
+ ret = -EDESTADDRREQ;
_leave(" = %d", ret);
return ERR_PTR(ret);
}
@@ -138,26 +141,29 @@ error:
}
/*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * afs_cell_crate() - create a cell record
+ * @name: is the name of the cell.
+ * @namsesz: is the strlen of the cell name.
+ * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format.
+ * @retref: is T to return the cell reference when the cell exists.
*/
-struct afs_cell *afs_cell_create(const char *name, char *vllist)
+struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
+ char *vllist, bool retref)
{
struct afs_cell *cell;
int ret;
- _enter("%s,%s", name, vllist);
+ _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
down_write(&afs_cells_sem);
read_lock(&afs_cells_lock);
list_for_each_entry(cell, &afs_cells, link) {
- if (strcasecmp(cell->name, name) == 0)
+ if (strncasecmp(cell->name, name, namesz) == 0)
goto duplicate_name;
}
read_unlock(&afs_cells_lock);
- cell = afs_cell_alloc(name, vllist);
+ cell = afs_cell_alloc(name, namesz, vllist);
if (IS_ERR(cell)) {
_leave(" = %ld", PTR_ERR(cell));
up_write(&afs_cells_sem);
@@ -197,8 +203,18 @@ error:
return ERR_PTR(ret);
duplicate_name:
+ if (retref && !IS_ERR(cell))
+ afs_get_cell(cell);
+
read_unlock(&afs_cells_lock);
up_write(&afs_cells_sem);
+
+ if (retref) {
+ _leave(" = %p", cell);
+ return cell;
+ }
+
+ _leave(" = -EEXIST");
return ERR_PTR(-EEXIST);
}
@@ -229,7 +245,7 @@ int afs_cell_init(char *rootcell)
*cp++ = 0;
/* allocate a cell record for the root cell */
- new_root = afs_cell_create(rootcell, cp);
+ new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
return PTR_ERR(new_root);
@@ -249,11 +265,12 @@ int afs_cell_init(char *rootcell)
/*
* lookup a cell record
*/
-struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
+ bool dns_cell)
{
struct afs_cell *cell;
- _enter("\"%*.*s\",", namesz, namesz, name ? name : "");
+ _enter("\"%*.*s\",", namesz, namesz, name ?: "");
down_read(&afs_cells_sem);
read_lock(&afs_cells_lock);
@@ -267,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
}
}
cell = ERR_PTR(-ENOENT);
+ if (dns_cell)
+ goto create_cell;
found:
;
} else {
@@ -289,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
up_read(&afs_cells_sem);
_leave(" = %p", cell);
return cell;
+
+create_cell:
+ read_unlock(&afs_cells_lock);
+ up_read(&afs_cells_sem);
+
+ cell = afs_cell_create(name, namesz, NULL, true);
+
+ _leave(" = %p", cell);
+ return cell;
}
#if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b42d5cc1d6d..0d38c09bd55 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -477,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
}
/*
+ * Try to auto mount the mountpoint with pseudo directory, if the autocell
+ * operation is setted.
+ */
+static struct inode *afs_try_auto_mntpt(
+ int ret, struct dentry *dentry, struct inode *dir, struct key *key,
+ struct afs_fid *fid)
+{
+ const char *devname = dentry->d_name.name;
+ struct afs_vnode *vnode = AFS_FS_I(dir);
+ struct inode *inode;
+
+ _enter("%d, %p{%s}, {%x:%u}, %p",
+ ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
+
+ if (ret != -ENOENT ||
+ !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+ goto out;
+
+ inode = afs_iget_autocell(dir, devname, strlen(devname), key);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ *fid = AFS_FS_I(inode)->fid;
+ _leave("= %p", inode);
+ return inode;
+
+out:
+ _leave("= %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
* look up an entry in a directory
*/
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -520,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
ret = afs_do_lookup(dir, dentry, &fid, key);
if (ret < 0) {
+ inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
+ if (!IS_ERR(inode)) {
+ key_put(key);
+ goto success;
+ }
+
+ ret = PTR_ERR(inode);
key_put(key);
if (ret == -ENOENT) {
d_add(dentry, NULL);
@@ -539,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(inode);
}
+success:
dentry->d_op = &afs_fs_dentry_operations;
d_add(dentry, inode);
@@ -696,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
goto zap;
if (dentry->d_inode &&
- test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
- goto zap;
+ (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
+ test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
+ goto zap;
_leave(" = 0 [keep]");
return 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 320ffef1157..0747339011c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
#include "internal.h"
struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
}
/*
+ * iget5() comparator for inode created by autocell operations
+ *
+ * These pseudo inodes don't match anything.
+ */
+static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
+{
+ return 0;
+}
+
+/*
* iget5() inode initialiser
*/
static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
}
/*
+ * inode retrieval for autocell
+ */
+struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
+ int namesz, struct key *key)
+{
+ struct afs_iget_data data;
+ struct afs_super_info *as;
+ struct afs_vnode *vnode;
+ struct super_block *sb;
+ struct inode *inode;
+ static atomic_t afs_autocell_ino;
+
+ _enter("{%x:%u},%*.*s,",
+ AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
+ namesz, namesz, dev_name ?: "");
+
+ sb = dir->i_sb;
+ as = sb->s_fs_info;
+ data.volume = as->volume;
+ data.fid.vid = as->volume->vid;
+ data.fid.unique = 0;
+ data.fid.vnode = 0;
+
+ inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
+ afs_iget5_autocell_test, afs_iget5_set,
+ &data);
+ if (!inode) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+ inode, inode->i_ino, data.fid.vid, data.fid.vnode,
+ data.fid.unique);
+
+ vnode = AFS_FS_I(inode);
+
+ /* there shouldn't be an existing inode */
+ BUG_ON(!(inode->i_state & I_NEW));
+
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ inode->i_op = &afs_autocell_inode_operations;
+ inode->i_nlink = 2;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_ctime.tv_sec = get_seconds();
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime;
+ inode->i_blocks = 0;
+ inode->i_version = 0;
+ inode->i_generation = 0;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+ inode->i_flags |= S_NOATIME;
+ unlock_new_inode(inode);
+ _leave(" = %p", inode);
+ return inode;
+}
+
+/*
* inode retrieval
*/
struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -314,6 +387,19 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
}
/*
+ * discard an AFS inode
+ */
+int afs_drop_inode(struct inode *inode)
+{
+ _enter("");
+
+ if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
+ return generic_delete_inode(inode);
+ else
+ return generic_drop_inode(inode);
+}
+
+/*
* clear an AFS inode
*/
void afs_evict_inode(struct inode *inode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8679089ce9a..cca8eef736f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
struct afs_mount_params {
bool rwpath; /* T if the parent should be considered R/W */
bool force; /* T to force cell type */
+ bool autocell; /* T if set auto mount operation */
afs_voltype_t type; /* type of volume requested */
int volnamesz; /* size of volume name */
const char *volname; /* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */
#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */
#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */
+#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */
+#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */
long acl_order; /* ACL check count (callback break count) */
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, char *);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
extern struct afs_cell *afs_grab_cell(struct afs_cell *);
extern void afs_put_cell(struct afs_cell *);
extern void afs_cell_purge(void);
@@ -558,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
/*
* inode.c
*/
+extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
+ struct key *);
extern struct inode *afs_iget(struct super_block *, struct key *,
struct afs_fid *, struct afs_file_status *,
struct afs_callback *);
@@ -566,6 +571,7 @@ extern int afs_validate(struct afs_vnode *, struct key *);
extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int afs_setattr(struct dentry *, struct iattr *);
extern void afs_evict_inode(struct inode *);
+extern int afs_drop_inode(struct inode *);
/*
* main.c
@@ -581,6 +587,7 @@ extern int afs_abort_to_error(u32);
* mntpt.c
*/
extern const struct inode_operations afs_mntpt_inode_operations;
+extern const struct inode_operations afs_autocell_inode_operations;
extern const struct file_operations afs_mntpt_file_operations;
extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -752,12 +759,6 @@ extern unsigned afs_debug;
#define dbgprintk(FMT,...) \
printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -792,9 +793,9 @@ do { \
} while (0)
#else
-#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
#endif
/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a9e23039ea3..6d552686c49 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -38,6 +38,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
.getattr = afs_getattr,
};
+const struct inode_operations afs_autocell_inode_operations = {
+ .follow_link = afs_mntpt_follow_link,
+ .getattr = afs_getattr,
+};
+
static LIST_HEAD(afs_vfsmounts);
static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
@@ -136,20 +141,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
{
struct afs_super_info *super;
struct vfsmount *mnt;
+ struct afs_vnode *vnode;
struct page *page;
- size_t size;
- char *buf, *devname, *options;
+ char *devname, *options;
+ bool rwpath = false;
int ret;
_enter("{%s}", mntpt->d_name.name);
BUG_ON(!mntpt->d_inode);
- ret = -EINVAL;
- size = mntpt->d_inode->i_size;
- if (size > PAGE_SIZE - 1)
- goto error_no_devname;
-
ret = -ENOMEM;
devname = (char *) get_zeroed_page(GFP_KERNEL);
if (!devname)
@@ -159,28 +160,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
if (!options)
goto error_no_options;
- /* read the contents of the AFS special symlink */
- page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto error_no_page;
+ vnode = AFS_FS_I(mntpt->d_inode);
+ if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
+ /* if the directory is a pseudo directory, use the d_name */
+ static const char afs_root_cell[] = ":root.cell.";
+ unsigned size = mntpt->d_name.len;
+
+ ret = -ENOENT;
+ if (size < 2 || size > AFS_MAXCELLNAME)
+ goto error_no_page;
+
+ if (mntpt->d_name.name[0] == '.') {
+ devname[0] = '#';
+ memcpy(devname + 1, mntpt->d_name.name, size - 1);
+ memcpy(devname + size, afs_root_cell,
+ sizeof(afs_root_cell));
+ rwpath = true;
+ } else {
+ devname[0] = '%';
+ memcpy(devname + 1, mntpt->d_name.name, size);
+ memcpy(devname + size + 1, afs_root_cell,
+ sizeof(afs_root_cell));
+ }
+ } else {
+ /* read the contents of the AFS special symlink */
+ loff_t size = i_size_read(mntpt->d_inode);
+ char *buf;
+
+ ret = -EINVAL;
+ if (size > PAGE_SIZE - 1)
+ goto error_no_page;
+
+ page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto error_no_page;
+ }
+
+ ret = -EIO;
+ if (PageError(page))
+ goto error;
+
+ buf = kmap_atomic(page, KM_USER0);
+ memcpy(devname, buf, size);
+ kunmap_atomic(buf, KM_USER0);
+ page_cache_release(page);
+ page = NULL;
}
- ret = -EIO;
- if (PageError(page))
- goto error;
-
- buf = kmap_atomic(page, KM_USER0);
- memcpy(devname, buf, size);
- kunmap_atomic(buf, KM_USER0);
- page_cache_release(page);
- page = NULL;
-
/* work out what options we want */
super = AFS_FS_S(mntpt->d_sb);
memcpy(options, "cell=", 5);
strcpy(options + 5, super->volume->cell->name);
- if (super->volume->type == AFSVL_RWVOL)
+ if (super->volume->type == AFSVL_RWVOL || rwpath)
strcat(options, ",rwpath");
/* try and do the mount */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a..096b23f821a 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
if (strcmp(kbuf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_cell_create(name, args);
+ cell = afs_cell_create(name, strlen(name), args, false);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
goto done;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 9cf80f02da1..77e1e5a6115 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,6 +16,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
@@ -48,6 +49,7 @@ struct file_system_type afs_fs_type = {
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
+ .drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.evict_inode = afs_evict_inode,
.put_super = afs_put_super,
@@ -62,12 +64,14 @@ enum {
afs_opt_cell,
afs_opt_rwpath,
afs_opt_vol,
+ afs_opt_autocell,
};
static const match_table_t afs_options_list = {
{ afs_opt_cell, "cell=%s" },
{ afs_opt_rwpath, "rwpath" },
{ afs_opt_vol, "vol=%s" },
+ { afs_opt_autocell, "autocell" },
{ afs_no_opt, NULL },
};
@@ -151,7 +155,8 @@ static int afs_parse_options(struct afs_mount_params *params,
switch (token) {
case afs_opt_cell:
cell = afs_cell_lookup(args[0].from,
- args[0].to - args[0].from);
+ args[0].to - args[0].from,
+ false);
if (IS_ERR(cell))
return PTR_ERR(cell);
afs_put_cell(params->cell);
@@ -166,6 +171,10 @@ static int afs_parse_options(struct afs_mount_params *params,
*devname = args[0].from;
break;
+ case afs_opt_autocell:
+ params->autocell = 1;
+ break;
+
default:
printk(KERN_ERR "kAFS:"
" Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +261,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
/* lookup the cell record */
if (cellname || !params->cell) {
- cell = afs_cell_lookup(cellname, cellnamesz);
+ cell = afs_cell_lookup(cellname, cellnamesz, true);
if (IS_ERR(cell)) {
- printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
- cellname ?: "");
+ printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
+ cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
afs_put_cell(params->cell);
@@ -321,6 +330,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
if (IS_ERR(inode))
goto error_inode;
+ if (params->autocell)
+ set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
+
ret = -ENOMEM;
root = d_alloc_root(inode);
if (!root)
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 52e59bf4aa5..f024d8aadde 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
return POLLERR;
}
-static int bad_file_ioctl (struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- return -EIO;
-}
-
static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
unsigned long arg)
{
@@ -159,7 +153,6 @@ static const struct file_operations bad_file_ops =
.aio_write = bad_file_aio_write,
.readdir = bad_file_readdir,
.poll = bad_file_poll,
- .ioctl = bad_file_ioctl,
.unlocked_ioctl = bad_file_unlocked_ioctl,
.compat_ioctl = bad_file_compat_ioctl,
.mmap = bad_file_mmap,
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9e60fd20171..a7528b91393 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
Node *fmt;
struct file * interp_file = NULL;
char iname[BINPRM_BUF_SIZE];
- char *iname_addr = iname;
+ const char *iname_addr = iname;
int retval;
int fd_binary = -1;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb2..396a9884591 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
{
- char *cp, *i_name, *i_arg;
+ const char *i_arg, *i_name;
+ char *cp;
struct file *file;
char interp[BINPRM_BUF_SIZE];
int retval;
diff --git a/fs/buffer.c b/fs/buffer.c
index 50efa339e05..3e7dca279d1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
spin_unlock(lock);
/*
* Ensure any pending I/O completes so that
- * ll_rw_block() actually writes the current
- * contents - it is a noop if I/O is still in
- * flight on potentially older contents.
+ * write_dirty_buffer() actually writes the
+ * current contents - it is a noop if I/O is
+ * still in flight on potentially older
+ * contents.
*/
- ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+ write_dirty_buffer(bh, WRITE_SYNC_PLUG);
/*
* Kick off IO for the previous mapping. Note
@@ -2912,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
BUG_ON(buffer_unwritten(bh));
/*
- * Mask in barrier bit for a write (could be either a WRITE or a
- * WRITE_SYNC
- */
- if (buffer_ordered(bh) && (rw & WRITE))
- rw |= WRITE_BARRIER;
-
- /*
* Only clear out a write error when rewriting
*/
if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2956,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
* requests an I/O operation on them, either a %READ or a %WRITE. The third
- * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
- * are sent to disk. The fourth %READA option is described in the documentation
- * for generic_make_request() which ll_rw_block() calls.
+ * %READA option is described in the documentation for generic_make_request()
+ * which ll_rw_block() calls.
*
* This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
- * clean when doing a write request, and any buffer that appears to be
- * up-to-date when doing read request. Further it marks as clean buffers that
- * are processed for writing (the buffer cache won't assume that they are
- * actually clean until the buffer gets unlocked).
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
+ * request, and any buffer that appears to be up-to-date when doing read
+ * request. Further it marks as clean buffers that are processed for
+ * writing (the buffer cache won't assume that they are actually clean
+ * until the buffer gets unlocked).
*
* ll_rw_block sets b_end_io to simple completion handler that marks
* the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2987,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
- if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
- lock_buffer(bh);
- else if (!trylock_buffer(bh))
+ if (!trylock_buffer(bh))
continue;
-
- if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
- rw == SWRITE_SYNC_PLUG) {
+ if (rw == WRITE) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- if (rw == SWRITE_SYNC)
- submit_bh(WRITE_SYNC, bh);
- else
- submit_bh(WRITE, bh);
+ submit_bh(WRITE, bh);
continue;
}
} else {
@@ -3016,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
}
EXPORT_SYMBOL(ll_rw_block);
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+ lock_buffer(bh);
+ if (!test_clear_buffer_dirty(bh)) {
+ unlock_buffer(bh);
+ return;
+ }
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
+
/*
* For a data-integrity writeout, we need to wait upon any in-progress I/O
* and then start new I/O and then wait upon it. The caller must have a ref on
* the buffer_head.
*/
-int sync_dirty_buffer(struct buffer_head *bh)
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
{
int ret = 0;
@@ -3030,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE_SYNC, bh);
+ ret = submit_bh(rw, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
@@ -3043,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
}
return ret;
}
+EXPORT_SYMBOL(__sync_dirty_buffer);
+
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+ return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
EXPORT_SYMBOL(sync_dirty_buffer);
/*
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226d..bd6bc1bde2d 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do { \
#define dbgprintk(FMT, ...) \
printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline void _dbprintk(const char *fmt, ...)
- __attribute__((format(printf, 1, 2)));
-static inline void _dbprintk(const char *fmt, ...)
-{
-}
-
#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do { \
} while (0)
#else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
#if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d0229..4cfce1ee31f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
/* dirty the head */
spin_lock(&inode->i_lock);
- if (ci->i_wrbuffer_ref_head == 0)
+ if (ci->i_head_snapc == NULL)
ci->i_head_snapc = ceph_get_snap_context(snapc);
++ci->i_wrbuffer_ref_head;
if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
-
- if (mapping_cap_account_dirty(mapping)) {
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- __inc_bdi_stat(mapping->backing_dev_info,
- BDI_RECLAIMABLE);
- task_io_account_write(PAGE_CACHE_SIZE);
- }
+ account_page_dirtied(page, page->mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
@@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
break;
}
}
- if (!snapc && ci->i_head_snapc) {
+ if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
dout(" head snapc %p has %d dirty pages\n",
snapc, ci->i_wrbuffer_ref_head);
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8..a2d002cbdec 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
th = get_ticket_handler(ac, service);
- if (!th) {
+ if (IS_ERR(th)) {
*pneed |= service;
continue;
}
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th =
get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
ceph_x_validate_tickets(ac, &need);
dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
return -ERANGE;
head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
- BUG_ON(!th);
ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
if (ret)
return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
case CEPHX_GET_PRINCIPAL_SESSION_KEY:
th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
- BUG_ON(!th);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
buf + sizeof(*head), end);
break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
void *end = p + sizeof(au->reply_buf);
th = get_ticket_handler(ac, au->service);
- if (!th)
- return -EIO; /* hrm! */
+ if (IS_ERR(th))
+ return PTR_ERR(th);
ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
if (ret < 0)
return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
- if (th && !IS_ERR(th))
+ if (!IS_ERR(th))
remove_ticket_handler(ac, th);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b0397..a2069b6680a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
gid_t gid;
struct ceph_mds_session *session;
u64 xattr_version = 0;
+ struct ceph_buffer *xattr_blob = NULL;
int delayed = 0;
u64 flush_tid = 0;
int i;
@@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
for (i = 0; i < CEPH_CAP_BITS; i++)
if (flushing & (1 << i))
ci->i_cap_flush_tid[i] = flush_tid;
+
+ follows = ci->i_head_snapc->seq;
+ } else {
+ follows = 0;
}
keep = cap->implemented;
@@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
mtime = inode->i_mtime;
atime = inode->i_atime;
time_warp_seq = ci->i_time_warp_seq;
- follows = ci->i_snap_realm->cached_context->seq;
uid = inode->i_uid;
gid = inode->i_gid;
mode = inode->i_mode;
- if (dropping & CEPH_CAP_XATTR_EXCL) {
+ if (flushing & CEPH_CAP_XATTR_EXCL) {
__ceph_build_xattrs_blob(ci);
- xattr_version = ci->i_xattrs.version + 1;
+ xattr_blob = ci->i_xattrs.blob;
+ xattr_version = ci->i_xattrs.version;
}
spin_unlock(&inode->i_lock);
@@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
size, max_size, &mtime, &atime, time_warp_seq,
- uid, gid, mode,
- xattr_version,
- (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+ uid, gid, mode, xattr_version, xattr_blob,
follows);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
@@ -1282,7 +1285,7 @@ retry:
&capsnap->mtime, &capsnap->atime,
capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
- 0, NULL,
+ capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows);
next_follows = capsnap->follows + 1;
@@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
- dout(" inode %p now dirty\n", &ci->vfs_inode);
+ if (!ci->i_head_snapc)
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+ ci->i_head_snapc);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
- if (!ci->i_wrbuffer_ref_head) {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
@@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = 1;
+ if (ci->i_wrbuffer_ref_head == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
} else {
BUG_ON(list_empty(&ci->i_dirty_item));
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718..6fd8b20a861 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_dentry) {
path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&pathbase, 0);
+ if (IS_ERR(path))
+ path = NULL;
spin_lock(&req->r_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_old_dentry) {
path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
&pathbase, 0);
+ if (IS_ERR(path))
+ path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
ceph_ino(req->r_old_dentry->d_parent->d_inode),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d552..6e4f43ff23e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
else
dentry->d_op = &ceph_snap_dentry_ops;
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
if (!di)
return -ENOMEM; /* oh well */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e39..e7cca414da0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
if (ci->i_files == 0 && ci->i_subdirs == 0 &&
ceph_snap(inode) == CEPH_NOSNAP &&
(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
dout(" marking %p complete (empty)\n", inode);
ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -1229,11 +1230,11 @@ retry_lookup:
in = dn->d_inode;
} else {
in = ceph_get_inode(parent->d_sb, vino);
- if (in == NULL) {
+ if (IS_ERR(in)) {
dout("new_inode badness\n");
d_delete(dn);
dput(dn);
- err = -ENOMEM;
+ err = PTR_ERR(in);
goto out;
}
dn = splice_dentry(dn, in, NULL);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454..ff4e753aae9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
length = fl->fl_end - fl->fl_start + 1;
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid, (u64)fl->fl_nspid,
+ (u64)fl->fl_pid,
+ (u64)(unsigned long)fl->fl_nspid,
lock_cmd, fl->fl_start,
length, wait);
if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
/* undo! This should only happen if the kernel detects
* local deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid, (u64)fl->fl_nspid,
+ (u64)fl->fl_pid,
+ (u64)(unsigned long)fl->fl_nspid,
CEPH_LOCK_UNLOCK, fl->fl_start,
length, 0);
dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
length = fl->fl_end - fl->fl_start + 1;
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+ file, (u64)fl->fl_pid,
+ (u64)(unsigned long)fl->fl_nspid,
lock_cmd, fl->fl_start,
length, wait);
if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, (u64)fl->fl_pid,
- (u64)fl->fl_nspid,
+ (u64)(unsigned long)fl->fl_nspid,
CEPH_LOCK_UNLOCK, fl->fl_start,
length, 0);
dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+ cephlock->pid_namespace =
+ cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
switch (lock->fl_type) {
case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe3..f091b135178 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
*
* Called under mdsc->mutex.
*/
+struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+ while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ dentry = dentry->d_parent;
+ return dentry;
+}
+
static int __choose_mds(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_inode) {
inode = req->r_inode;
} else if (req->r_dentry) {
- if (req->r_dentry->d_inode) {
+ struct inode *dir = req->r_dentry->d_parent->d_inode;
+
+ if (dir->i_sb != mdsc->client->sb) {
+ /* not this fs! */
+ inode = req->r_dentry->d_inode;
+ } else if (ceph_snap(dir) != CEPH_NOSNAP) {
+ /* direct snapped/virtual snapdir requests
+ * based on parent dir inode */
+ struct dentry *dn =
+ get_nonsnap_parent(req->r_dentry->d_parent);
+ inode = dn->d_inode;
+ dout("__choose_mds using nonsnap parent %p\n", inode);
+ } else if (req->r_dentry->d_inode) {
+ /* dentry target */
inode = req->r_dentry->d_inode;
} else {
- inode = req->r_dentry->d_parent->d_inode;
+ /* dir + name */
+ inode = dir;
hash = req->r_dentry->d_name.hash;
is_hash = true;
}
}
+
dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
(int)hash, mode);
if (!inode)
@@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session,
pr_info("mds%d reconnect denied\n", session->s_mds);
remove_session_caps(session);
wake = 1; /* for good measure */
- complete_all(&mdsc->session_close_waiters);
+ wake_up_all(&mdsc->session_close_wq);
kick_requests(mdsc, mds);
break;
@@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- BUG_ON(err);
+ goto out_dput;
}
} else {
path = NULL;
@@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
err = ceph_pagelist_encode_string(pagelist, path, pathlen);
if (err)
- goto out;
+ goto out_free;
spin_lock(&inode->i_lock);
cap->seq = 0; /* reset cap seq */
@@ -2354,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
unlock_kernel();
}
-out:
+out_free:
kfree(path);
+out_dput:
dput(dentry);
return err;
}
@@ -2876,7 +2899,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
return -ENOMEM;
init_completion(&mdsc->safe_umount_waiters);
- init_completion(&mdsc->session_close_waiters);
+ init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->sessions = NULL;
mdsc->max_sessions = 0;
@@ -3021,6 +3044,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
}
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+ int i, n = 0;
+
+ if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ return true;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++)
+ if (mdsc->sessions[i])
+ n++;
+ mutex_unlock(&mdsc->mutex);
+ return n == 0;
+}
/*
* called after sb is ro.
@@ -3029,45 +3069,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_mds_session *session;
int i;
- int n;
struct ceph_client *client = mdsc->client;
- unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+ unsigned long timeout = client->mount_args->mount_timeout * HZ;
dout("close_sessions\n");
- mutex_lock(&mdsc->mutex);
-
/* close sessions */
- started = jiffies;
- while (time_before(jiffies, started + timeout)) {
- dout("closing sessions\n");
- n = 0;
- for (i = 0; i < mdsc->max_sessions; i++) {
- session = __ceph_lookup_mds_session(mdsc, i);
- if (!session)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&session->s_mutex);
- __close_session(mdsc, session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- mutex_lock(&mdsc->mutex);
- n++;
- }
- if (n == 0)
- break;
-
- if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
- break;
-
- dout("waiting for sessions to close\n");
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ session = __ceph_lookup_mds_session(mdsc, i);
+ if (!session)
+ continue;
mutex_unlock(&mdsc->mutex);
- wait_for_completion_timeout(&mdsc->session_close_waiters,
- timeout);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex);
}
+ mutex_unlock(&mdsc->mutex);
+
+ dout("waiting for sessions to close\n");
+ wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+ timeout);
/* tear down remaining sessions */
+ mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i]) {
session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3107,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_lock(&mdsc->mutex);
}
}
-
WARN_ON(!list_empty(&mdsc->cap_delay_list));
-
mutex_unlock(&mdsc->mutex);
ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e34..c98267ce6d2 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
struct mutex mutex; /* all nested structures */
struct ceph_mdsmap *mdsmap;
- struct completion safe_umount_waiters, session_close_waiters;
+ struct completion safe_umount_waiters;
+ wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c..dfced1dacbc 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
reqhead->reassert_version = req->r_reassert_version;
req->r_stamp = jiffies;
- list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+ list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
ceph_msg_get(req->r_request); /* send consumes a ref */
ceph_con_send(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badb..4868b9dcac5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
- int used;
+ int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
if (!capsnap) {
@@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
spin_lock(&inode->i_lock);
used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
if (__ceph_have_pending_cap_snap(ci)) {
/* there is no point in queuing multiple "pending" cap_snaps,
as no new writes are allowed to start when pending, so any
@@ -452,11 +453,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
kfree(capsnap);
- } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+ } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+ (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+ CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
struct ceph_snap_context *snapc = ci->i_head_snapc;
+ dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+ capsnap, snapc);
igrab(inode);
-
+
atomic_set(&capsnap->nref, 1);
capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
@@ -464,15 +469,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->follows = snapc->seq - 1;
capsnap->issued = __ceph_caps_issued(ci, NULL);
- capsnap->dirty = __ceph_caps_dirty(ci);
+ capsnap->dirty = dirty;
capsnap->mode = inode->i_mode;
capsnap->uid = inode->i_uid;
capsnap->gid = inode->i_gid;
- /* fixme? */
- capsnap->xattr_blob = NULL;
- capsnap->xattr_len = 0;
+ if (dirty & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ capsnap->xattr_blob =
+ ceph_buffer_get(ci->i_xattrs.blob);
+ capsnap->xattr_version = ci->i_xattrs.version;
+ } else {
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_version = 0;
+ }
/* dirty page count moved from _head to this cap_snap;
all subsequent writes page dirties occur _after_ this
@@ -480,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
ci->i_wrbuffer_ref_head = 0;
capsnap->context = snapc;
- ci->i_head_snapc = NULL;
+ ci->i_head_snapc =
+ ceph_get_snap_context(ci->i_snap_realm->cached_context);
+ dout(" new snapc is %p\n", ci->i_head_snapc);
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
@@ -539,6 +552,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
return 1; /* caller may want to ceph_flush_snaps */
}
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+ struct ceph_inode_info *ci;
+ struct inode *lastinode = NULL;
+ struct ceph_snap_realm *child;
+
+ dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_for_each_entry(ci, &realm->inodes_with_caps,
+ i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ lastinode = inode;
+ ceph_queue_cap_snap(ci);
+ spin_lock(&realm->inodes_with_caps_lock);
+ }
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+
+ dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+ list_for_each_entry(child, &realm->children, child_item)
+ queue_realm_cap_snaps(child);
+
+ dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
/*
* Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -589,29 +637,8 @@ more:
*
* ...unless it's a snap deletion!
*/
- if (!deletion) {
- struct ceph_inode_info *ci;
- struct inode *lastinode = NULL;
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_for_each_entry(ci, &realm->inodes_with_caps,
- i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
- if (!inode)
- continue;
- spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
- lastinode = inode;
- ceph_queue_cap_snap(ci);
- spin_lock(&realm->inodes_with_caps_lock);
- }
- spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
- dout("update_snap_trace cap_snaps queued\n");
- }
-
+ if (!deletion)
+ queue_realm_cap_snaps(realm);
} else {
dout("update_snap_trace %llx %p seq %lld unchanged\n",
realm->ino, realm, realm->seq);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0d..c33897ae572 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
uid_t uid;
gid_t gid;
- void *xattr_blob;
- int xattr_len;
+ struct ceph_buffer *xattr_blob;
u64 xattr_version;
u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
- if (atomic_dec_and_test(&capsnap->nref))
+ if (atomic_dec_and_test(&capsnap->nref)) {
+ if (capsnap->xattr_blob)
+ ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
+ }
}
/*
@@ -342,7 +344,8 @@ struct ceph_inode_info {
unsigned i_cap_exporting_issued;
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
- struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
+ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
+ dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00..9578af610b7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = NULL;
ci->i_xattrs.dirty = false;
+ ci->i_xattrs.version++;
}
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb..0da1debd499 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,8 @@ config CIFS
tristate "CIFS support (advanced network filesystem, SMBFS successor)"
depends on INET
select NLS
+ select CRYPTO_MD5
+ select CRYPTO_ARC4
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/README b/fs/cifs/README
index a7081eeeb85..7099a526f77 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
gid Set the default gid for inodes (similar to above).
file_mode If CIFS Unix extensions are not supported by the server
this overrides the default mode for file inodes.
+ fsc Enable local disk caching using FS-Cache (off by default). This
+ option could be useful to improve performance on a slow link,
+ heavily loaded server and/or network where reading from the
+ disk is faster than reading from the server (over the network).
+ This could also impact scalability positively as the
+ number of calls to the server are reduced. However, local
+ caching is not suitable for all workloads for e.g. read-once
+ type workloads. So, you need to consider carefully your
+ workload/scenario before using this option. Currently, local
+ disk caching is functional for CIFS files opened as read-only.
dir_mode If CIFS Unix extensions are not supported by the server
this overrides the default mode for directory inodes.
port attempt to contact the server on this tcp port, before
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0b..21f0fbd8698 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
if (compare_oid(oid, oidlen, MSKRB5_OID,
MSKRB5_OID_LEN))
server->sec_mskerberos = true;
- else if (compare_oid(oid, oidlen, KRB5U2U_OID,
+ if (compare_oid(oid, oidlen, KRB5U2U_OID,
KRB5U2U_OID_LEN))
server->sec_kerberosu2u = true;
- else if (compare_oid(oid, oidlen, KRB5_OID,
+ if (compare_oid(oid, oidlen, KRB5_OID,
KRB5_OID_LEN))
server->sec_kerberos = true;
- else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+ if (compare_oid(oid, oidlen, NTLMSSP_OID,
NTLMSSP_OID_LEN))
server->sec_ntlmssp = true;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6..7fe6b52df50 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
* This is a compressed table of upper and lower case conversion.
*
*/
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
#include <asm/byteorder.h>
#include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
#endif /* UNIUPR_NOUPPER */
#ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
#endif /* UNIUPR_NOLOWER */
#ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
* UniTolower: Convert a unicode character to lower case
*/
static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
{
- register struct UniCaseRange *rp;
+ register const struct UniCaseRange *rp;
- if (uc < sizeof(UniLowerTable)) {
+ if (uc < sizeof(CifsUniLowerTable)) {
/* Latin characters */
- return uc + UniLowerTable[uc]; /* Use base tables */
+ return uc + CifsUniLowerTable[uc]; /* Use base tables */
} else {
- rp = UniLowerRange; /* Use range tables */
+ rp = CifsUniLowerRange; /* Use range tables */
while (rp->start) {
if (uc < rp->start) /* Before start of range */
return uc; /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
}
#endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e51..0ac7c5a8633 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
/*
* Latin lower case
*/
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
/*
* Lower Case Range
*/
-static const struct UniCaseRange CifsUniLowerRange[] = {
- 0x0380, 0x03ab, UniCaseRangeL0380,
- 0x0400, 0x042f, UniCaseRangeL0400,
- 0x0490, 0x04cb, UniCaseRangeL0490,
- 0x1e00, 0x1ff7, UniCaseRangeL1e00,
- 0xff20, 0xff3a, UniCaseRangeLff20,
- 0, 0, 0
+const struct UniCaseRange CifsUniLowerRange[] = {
+ {0x0380, 0x03ab, UniCaseRangeL0380},
+ {0x0400, 0x042f, UniCaseRangeL0400},
+ {0x0490, 0x04cb, UniCaseRangeL0490},
+ {0x1e00, 0x1ff7, UniCaseRangeL1e00},
+ {0xff20, 0xff3a, UniCaseRangeLff20},
+ {0}
};
#endif
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc4..709f2296bdb 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
#include "md5.h"
#include "cifs_unicode.h"
#include "cifsproto.h"
+#include "ntlmssp.h"
#include <linux/ctype.h>
#include <linux/random.h>
@@ -42,21 +43,43 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
- const struct mac_key *key, char *signature)
+ struct TCP_Server_Info *server, char *signature)
{
- struct MD5Context context;
+ int rc;
- if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+ if (cifs_pdu == NULL || server == NULL || signature == NULL)
return -EINVAL;
- cifs_MD5_init(&context);
- cifs_MD5_update(&context, (char *)&key->data, key->len);
- cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+ if (!server->ntlmssp.sdescmd5) {
+ cERROR(1,
+ "cifs_calculate_signature: can't generate signature\n");
+ return -1;
+ }
- cifs_MD5_final(signature, &context);
- return 0;
+ rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
+ if (rc) {
+ cERROR(1, "cifs_calculate_signature: oould not init md5\n");
+ return rc;
+ }
+
+ if (server->secType == RawNTLMSSP)
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ server->session_key.data.ntlmv2.key,
+ CIFS_NTLMV2_SESSKEY_SIZE);
+ else
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ (char *)&server->session_key.data,
+ server->session_key.len);
+
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+
+ rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
+
+ return rc;
}
+
int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
__u32 *pexpected_response_sequence_number)
{
@@ -78,8 +101,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
server->sequence_number++;
spin_unlock(&GlobalMid_Lock);
- rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
- smb_signature);
+ rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
if (rc)
memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
else
@@ -89,21 +111,39 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
}
static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
- const struct mac_key *key, char *signature)
+ struct TCP_Server_Info *server, char *signature)
{
- struct MD5Context context;
int i;
+ int rc;
- if ((iov == NULL) || (signature == NULL) || (key == NULL))
+ if (iov == NULL || server == NULL || signature == NULL)
return -EINVAL;
- cifs_MD5_init(&context);
- cifs_MD5_update(&context, (char *)&key->data, key->len);
+ if (!server->ntlmssp.sdescmd5) {
+ cERROR(1, "cifs_calc_signature2: can't generate signature\n");
+ return -1;
+ }
+
+ rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
+ if (rc) {
+ cERROR(1, "cifs_calc_signature2: oould not init md5\n");
+ return rc;
+ }
+
+ if (server->secType == RawNTLMSSP)
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ server->session_key.data.ntlmv2.key,
+ CIFS_NTLMV2_SESSKEY_SIZE);
+ else
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ (char *)&server->session_key.data,
+ server->session_key.len);
+
for (i = 0; i < n_vec; i++) {
if (iov[i].iov_len == 0)
continue;
if (iov[i].iov_base == NULL) {
- cERROR(1, "null iovec entry");
+ cERROR(1, "cifs_calc_signature2: null iovec entry");
return -EIO;
}
/* The first entry includes a length field (which does not get
@@ -111,18 +151,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
if (i == 0) {
if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
break; /* nothing to sign or corrupt header */
- cifs_MD5_update(&context, iov[0].iov_base+4,
- iov[0].iov_len-4);
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ iov[i].iov_base + 4, iov[i].iov_len - 4);
} else
- cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+ crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
+ iov[i].iov_base, iov[i].iov_len);
}
- cifs_MD5_final(signature, &context);
+ rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
- return 0;
+ return rc;
}
-
int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
__u32 *pexpected_response_sequence_number)
{
@@ -145,8 +185,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
server->sequence_number++;
spin_unlock(&GlobalMid_Lock);
- rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
- smb_signature);
+ rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
if (rc)
memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
else
@@ -156,14 +195,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
}
int cifs_verify_signature(struct smb_hdr *cifs_pdu,
- const struct mac_key *mac_key,
+ struct TCP_Server_Info *server,
__u32 expected_sequence_number)
{
- unsigned int rc;
+ int rc;
char server_response_sig[8];
char what_we_think_sig_should_be[20];
- if ((cifs_pdu == NULL) || (mac_key == NULL))
+ if (cifs_pdu == NULL || server == NULL)
return -EINVAL;
if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +231,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
cpu_to_le32(expected_sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- rc = cifs_calculate_signature(cifs_pdu, mac_key,
+ rc = cifs_calculate_signature(cifs_pdu, server,
what_we_think_sig_should_be);
if (rc)
@@ -209,7 +248,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
}
/* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+int cifs_calculate_session_key(struct session_key *key, const char *rn,
const char *password)
{
char temp_key[16];
@@ -223,63 +262,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
return 0;
}
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
- const struct nls_table *nls_info)
-{
- char temp_hash[16];
- struct HMACMD5Context ctx;
- char *ucase_buf;
- __le16 *unicode_buf;
- unsigned int i, user_name_len, dom_name_len;
-
- if (ses == NULL)
- return -EINVAL;
-
- E_md4hash(ses->password, temp_hash);
-
- hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
- user_name_len = strlen(ses->userName);
- if (user_name_len > MAX_USERNAME_SIZE)
- return -EINVAL;
- if (ses->domainName == NULL)
- return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
- dom_name_len = strlen(ses->domainName);
- if (dom_name_len > MAX_USERNAME_SIZE)
- return -EINVAL;
-
- ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
- if (ucase_buf == NULL)
- return -ENOMEM;
- unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
- if (unicode_buf == NULL) {
- kfree(ucase_buf);
- return -ENOMEM;
- }
-
- for (i = 0; i < user_name_len; i++)
- ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
- ucase_buf[i] = 0;
- user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
- MAX_USERNAME_SIZE*2, nls_info);
- unicode_buf[user_name_len] = 0;
- user_name_len++;
-
- for (i = 0; i < dom_name_len; i++)
- ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
- ucase_buf[i] = 0;
- dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
- MAX_USERNAME_SIZE*2, nls_info);
-
- unicode_buf[user_name_len + dom_name_len] = 0;
- hmac_md5_update((const unsigned char *) unicode_buf,
- (user_name_len+dom_name_len)*2, &ctx);
-
- hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
- kfree(ucase_buf);
- kfree(unicode_buf);
- return 0;
-}
-
#ifdef CONFIG_CIFS_WEAK_PW_HASH
void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
@@ -324,38 +306,52 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
{
int rc = 0;
int len;
- char nt_hash[16];
- struct HMACMD5Context *pctxt;
+ char nt_hash[CIFS_NTHASH_SIZE];
wchar_t *user;
wchar_t *domain;
+ wchar_t *server;
- pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
-
- if (pctxt == NULL)
- return -ENOMEM;
+ if (!ses->server->ntlmssp.sdeschmacmd5) {
+ cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+ return -1;
+ }
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash);
- /* convert Domainname to unicode and uppercase */
- hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+ crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash,
+ CIFS_NTHASH_SIZE);
+
+ rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
+ if (rc) {
+ cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
+ return rc;
+ }
/* convert ses->userName to unicode and uppercase */
len = strlen(ses->userName);
user = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (user == NULL)
+ if (user == NULL) {
+ cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+ rc = -ENOMEM;
goto calc_exit_2;
+ }
len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
UniStrupr(user);
- hmac_md5_update((char *)user, 2*len, pctxt);
+
+ crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ (char *)user, 2 * len);
/* convert ses->domainName to unicode and uppercase */
if (ses->domainName) {
len = strlen(ses->domainName);
domain = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (domain == NULL)
+ if (domain == NULL) {
+ cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+ rc = -ENOMEM;
goto calc_exit_1;
+ }
len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
nls_cp);
/* the following line was removed since it didn't work well
@@ -363,65 +359,292 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
Maybe converting the domain name earlier makes sense */
/* UniStrupr(domain); */
- hmac_md5_update((char *)domain, 2*len, pctxt);
+ crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ (char *)domain, 2 * len);
kfree(domain);
+ } else if (ses->serverName) {
+ len = strlen(ses->serverName);
+
+ server = kmalloc(2 + (len * 2), GFP_KERNEL);
+ if (server == NULL) {
+ cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+ rc = -ENOMEM;
+ goto calc_exit_1;
+ }
+ len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+ nls_cp);
+ /* the following line was removed since it didn't work well
+ with lower cased domain name that passed as an option.
+ Maybe converting the domain name earlier makes sense */
+ /* UniStrupr(domain); */
+
+ crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ (char *)server, 2 * len);
+
+ kfree(server);
}
+
+ rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ ses->server->ntlmv2_hash);
+
calc_exit_1:
kfree(user);
calc_exit_2:
/* BB FIXME what about bytes 24 through 40 of the signing key?
compare with the NTLM example */
- hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
- kfree(pctxt);
return rc;
}
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
- const struct nls_table *nls_cp)
+static int
+find_domain_name(struct cifsSesInfo *ses)
+{
+ int rc = 0;
+ unsigned int attrsize;
+ unsigned int type;
+ unsigned char *blobptr;
+ struct ntlmssp2_name *attrptr;
+
+ if (ses->server->tiblob) {
+ blobptr = ses->server->tiblob;
+ attrptr = (struct ntlmssp2_name *) blobptr;
+
+ while ((type = attrptr->type) != 0) {
+ blobptr += 2; /* advance attr type */
+ attrsize = attrptr->length;
+ blobptr += 2; /* advance attr size */
+ if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+ if (!ses->domainName) {
+ ses->domainName =
+ kmalloc(attrptr->length + 1,
+ GFP_KERNEL);
+ if (!ses->domainName)
+ return -ENOMEM;
+ cifs_from_ucs2(ses->domainName,
+ (__le16 *)blobptr,
+ attrptr->length,
+ attrptr->length,
+ load_nls_default(), false);
+ }
+ }
+ blobptr += attrsize; /* advance attr value */
+ attrptr = (struct ntlmssp2_name *) blobptr;
+ }
+ } else {
+ ses->server->tilen = 2 * sizeof(struct ntlmssp2_name);
+ ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL);
+ if (!ses->server->tiblob) {
+ ses->server->tilen = 0;
+ cERROR(1, "Challenge target info allocation failure");
+ return -ENOMEM;
+ }
+ memset(ses->server->tiblob, 0x0, ses->server->tilen);
+ attrptr = (struct ntlmssp2_name *) ses->server->tiblob;
+ attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
+ }
+
+ return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct TCP_Server_Info *server,
+ char *v2_session_response)
{
int rc;
+
+ if (!server->ntlmssp.sdeschmacmd5) {
+ cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+ return -1;
+ }
+
+ crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash,
+ CIFS_HMAC_MD5_HASH_SIZE);
+
+ rc = crypto_shash_init(&server->ntlmssp.sdeschmacmd5->shash);
+ if (rc) {
+ cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+ return rc;
+ }
+
+ memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+ server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
+ crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
+ v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+ sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE);
+
+ if (server->tilen)
+ crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
+ server->tiblob, server->tilen);
+
+ rc = crypto_shash_final(&server->ntlmssp.sdeschmacmd5->shash,
+ v2_session_response);
+
+ return rc;
+}
+
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
+ const struct nls_table *nls_cp)
+{
+ int rc = 0;
struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
- struct HMACMD5Context context;
buf->blob_signature = cpu_to_le32(0x00000101);
buf->reserved = 0;
buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
buf->reserved2 = 0;
- buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
- buf->names[0].length = 0;
- buf->names[1].type = 0;
- buf->names[1].length = 0;
+
+ if (!ses->domainName) {
+ rc = find_domain_name(ses);
+ if (rc) {
+ cERROR(1, "could not get domain/server name rc %d", rc);
+ return rc;
+ }
+ }
/* calculate buf->ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, nls_cp);
- if (rc)
+ if (rc) {
cERROR(1, "could not get v2 hash rc %d", rc);
- CalcNTLMv2_response(ses, resp_buf);
+ return rc;
+ }
+ rc = CalcNTLMv2_response(ses->server, resp_buf);
+ if (rc) {
+ cERROR(1, "could not get v2 hash rc %d", rc);
+ return rc;
+ }
- /* now calculate the MAC key for NTLMv2 */
- hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
- hmac_md5_update(resp_buf, 16, &context);
- hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+ if (!ses->server->ntlmssp.sdeschmacmd5) {
+ cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+ return -1;
+ }
- memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
- sizeof(struct ntlmv2_resp));
- ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+ crypto_shash_setkey(ses->server->ntlmssp.hmacmd5,
+ ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+ rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
+ if (rc) {
+ cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n");
+ return rc;
+ }
+
+ crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
+
+ rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
+ ses->server->session_key.data.ntlmv2.key);
+
+ memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
+ sizeof(struct ntlmv2_resp));
+ ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
+
+ return rc;
}
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
- char *v2_session_response)
+int
+calc_seckey(struct TCP_Server_Info *server)
{
- struct HMACMD5Context context;
- /* rest of v2 struct already generated */
- memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
- hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+ int rc;
+ unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE];
+ struct crypto_blkcipher *tfm_arc4;
+ struct scatterlist sgin, sgout;
+ struct blkcipher_desc desc;
+
+ get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
+
+ tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)",
+ 0, CRYPTO_ALG_ASYNC);
+ if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+ cERROR(1, "could not allocate " "master crypto API arc4\n");
+ return 1;
+ }
+
+ desc.tfm = tfm_arc4;
+
+ crypto_blkcipher_setkey(tfm_arc4,
+ server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE);
+ sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE);
+ sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+ rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
- hmac_md5_update(v2_session_response+8,
- sizeof(struct ntlmv2_resp) - 8, &context);
+ if (!rc)
+ memcpy(server->session_key.data.ntlmv2.key,
+ sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
+
+ crypto_free_blkcipher(tfm_arc4);
+
+ return 0;
+}
- hmac_md5_final(v2_session_response, &context);
-/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+ if (server->ntlmssp.md5)
+ crypto_free_shash(server->ntlmssp.md5);
+
+ if (server->ntlmssp.hmacmd5)
+ crypto_free_shash(server->ntlmssp.hmacmd5);
+
+ kfree(server->ntlmssp.sdeschmacmd5);
+
+ kfree(server->ntlmssp.sdescmd5);
+}
+
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+ int rc;
+ unsigned int size;
+
+ server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+ if (!server->ntlmssp.hmacmd5 ||
+ IS_ERR(server->ntlmssp.hmacmd5)) {
+ cERROR(1, "could not allocate crypto hmacmd5\n");
+ return 1;
+ }
+
+ server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0);
+ if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) {
+ cERROR(1, "could not allocate crypto md5\n");
+ rc = 1;
+ goto cifs_crypto_shash_allocate_ret1;
+ }
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->ntlmssp.hmacmd5);
+ server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->ntlmssp.sdeschmacmd5) {
+ cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+ rc = -ENOMEM;
+ goto cifs_crypto_shash_allocate_ret2;
+ }
+ server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5;
+ server->ntlmssp.sdeschmacmd5->shash.flags = 0x0;
+
+
+ size = sizeof(struct shash_desc) +
+ crypto_shash_descsize(server->ntlmssp.md5);
+ server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL);
+ if (!server->ntlmssp.sdescmd5) {
+ cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+ rc = -ENOMEM;
+ goto cifs_crypto_shash_allocate_ret3;
+ }
+ server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5;
+ server->ntlmssp.sdescmd5->shash.flags = 0x0;
+
+ return 0;
+
+cifs_crypto_shash_allocate_ret3:
+ kfree(server->ntlmssp.sdeschmacmd5);
+
+cifs_crypto_shash_allocate_ret2:
+ crypto_free_shash(server->ntlmssp.md5);
+
+cifs_crypto_shash_allocate_ret1:
+ crypto_free_shash(server->ntlmssp.hmacmd5);
+
+ return rc;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac..c9d0cfc086e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
#include <linux/workqueue.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
+
/*
* The sizes of various internal tables and strings
*/
@@ -97,7 +100,7 @@ enum protocolEnum {
/* Netbios frames protocol not supported at this time */
};
-struct mac_key {
+struct session_key {
unsigned int len;
union {
char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -120,6 +123,21 @@ struct cifs_cred {
struct cifs_ace *aces;
};
+struct sdesc {
+ struct shash_desc shash;
+ char ctx[];
+};
+
+struct ntlmssp_auth {
+ __u32 client_flags;
+ __u32 server_flags;
+ unsigned char ciphertext[CIFS_CPHTXT_SIZE];
+ struct crypto_shash *hmacmd5;
+ struct crypto_shash *md5;
+ struct sdesc *sdeschmacmd5;
+ struct sdesc *sdescmd5;
+};
+
/*
*****************************************************************
* Except the CIFS PDUs themselves all the
@@ -182,11 +200,14 @@ struct TCP_Server_Info {
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* needed for CIFS PDU signature */
- struct mac_key mac_signing_key;
+ struct session_key session_key;
char ntlmv2_hash[16];
unsigned long lstrp; /* when we got last response from this server */
u16 dialect; /* dialect index that server chose */
/* extended security flavors that server supports */
+ unsigned int tilen; /* length of the target info blob */
+ unsigned char *tiblob; /* target info blob in challenge response */
+ struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool sec_kerberosu2u; /* supports U2U Kerberos */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db1..320e0fd0ba7 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -134,6 +134,12 @@
* Size of the session key (crypto key encrypted with the password
*/
#define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTLMV2_SESSKEY_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
/*
* Maximum user name length
@@ -663,7 +669,6 @@ struct ntlmv2_resp {
__le64 time;
__u64 client_chal; /* random */
__u32 reserved2;
- struct ntlmssp2_name names[2];
/* array of name entries could follow ending in minimum 4 byte struct */
} __attribute__((packed));
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f545081408..1378d913384 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
__u32 *);
extern int cifs_verify_signature(struct smb_hdr *,
- const struct mac_key *mac_key,
+ struct TCP_Server_Info *server,
__u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
- const struct nls_table *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
const struct nls_table *);
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct TCP_Server_Info *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
extern void calc_lanman_hash(const char *password, const char *cryptkey,
bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd3..4bda920d1f7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -604,11 +604,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
else
rc = -EINVAL;
- if (server->sec_kerberos || server->sec_mskerberos)
- server->secType = Kerberos;
- else if (server->sec_ntlmssp)
- server->secType = RawNTLMSSP;
- else
+ if (server->secType == Kerberos) {
+ if (!server->sec_kerberos &&
+ !server->sec_mskerberos)
+ rc = -EOPNOTSUPP;
+ } else if (server->secType == RawNTLMSSP) {
+ if (!server->sec_ntlmssp)
+ rc = -EOPNOTSUPP;
+ } else
rc = -EOPNOTSUPP;
}
} else
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edf..ec0ea4a43bd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1673,7 +1673,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
MAX_USERNAME_SIZE))
continue;
if (strlen(vol->username) != 0 &&
- strncmp(ses->password, vol->password,
+ ses->password != NULL &&
+ strncmp(ses->password,
+ vol->password ? vol->password : "",
MAX_PASSWORD_SIZE))
continue;
}
@@ -1706,6 +1708,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
CIFSSMBLogoff(xid, ses);
_FreeXid(xid);
}
+ cifs_crypto_shash_release(server);
sesInfoFree(ses);
cifs_put_tcp_session(server);
}
@@ -1785,13 +1788,23 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->linux_uid = volume_info->linux_uid;
ses->overrideSecFlg = volume_info->secFlg;
+ rc = cifs_crypto_shash_allocate(server);
+ if (rc) {
+ cERROR(1, "could not setup hash structures rc %d", rc);
+ goto get_ses_fail;
+ }
+ server->tilen = 0;
+ server->tiblob = NULL;
+
mutex_lock(&ses->session_mutex);
rc = cifs_negotiate_protocol(xid, ses);
if (!rc)
rc = cifs_setup_session(xid, ses, volume_info->local_nls);
mutex_unlock(&ses->session_mutex);
- if (rc)
+ if (rc) {
+ cifs_crypto_shash_release(ses->server);
goto get_ses_fail;
+ }
/* success, put it on the list */
write_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46..f9ed0751cc1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
full_path = build_path_from_dentry(direntry);
if (full_path == NULL) {
rc = -ENOMEM;
- FreeXid(xid);
- return rc;
+ goto cifs_create_out;
}
if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (buf == NULL) {
- kfree(full_path);
- FreeXid(xid);
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto cifs_create_out;
}
/*
@@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
struct cifsTconInfo *pTcon;
char *full_path = NULL;
struct inode *newinode = NULL;
+ int oplock = 0;
+ u16 fileHandle;
+ FILE_ALL_INFO *buf = NULL;
+ unsigned int bytes_written;
+ struct win_dev *pdev;
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
pTcon = cifs_sb->tcon;
full_path = build_path_from_dentry(direntry);
- if (full_path == NULL)
+ if (full_path == NULL) {
rc = -ENOMEM;
- else if (pTcon->unix_ext) {
+ goto mknod_out;
+ }
+
+ if (pTcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = mode & ~current_umask(),
.ctime = NO_CHANGE_64,
@@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc)
+ goto mknod_out;
- if (!rc) {
- rc = cifs_get_inode_info_unix(&newinode, full_path,
+ rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
- if (pTcon->nocase)
- direntry->d_op = &cifs_ci_dentry_ops;
- else
- direntry->d_op = &cifs_dentry_ops;
- if (rc == 0)
- d_instantiate(direntry, newinode);
- }
- } else {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
- int oplock = 0;
- u16 fileHandle;
- FILE_ALL_INFO *buf;
+ if (pTcon->nocase)
+ direntry->d_op = &cifs_ci_dentry_ops;
+ else
+ direntry->d_op = &cifs_dentry_ops;
- cFYI(1, "sfu compat create special file");
+ if (rc == 0)
+ d_instantiate(direntry, newinode);
+ goto mknod_out;
+ }
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- kfree(full_path);
- rc = -ENOMEM;
- FreeXid(xid);
- return rc;
- }
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ goto mknod_out;
- rc = CIFSSMBOpen(xid, pTcon, full_path,
- FILE_CREATE, /* fail if exists */
- GENERIC_WRITE /* BB would
- WRITE_OWNER | WRITE_DAC be better? */,
- /* Create a file and set the
- file attribute to SYSTEM */
- CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
- &fileHandle, &oplock, buf,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
-
- /* BB FIXME - add handling for backlevel servers
- which need legacy open and check for all
- calls to SMBOpen for fallback to SMBLeagcyOpen */
- if (!rc) {
- /* BB Do not bother to decode buf since no
- local inode yet to put timestamps in,
- but we can reuse it safely */
- unsigned int bytes_written;
- struct win_dev *pdev;
- pdev = (struct win_dev *)buf;
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, pTcon,
- fileHandle,
- sizeof(struct win_dev),
- 0, &bytes_written, (char *)pdev,
- NULL, 0);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, pTcon,
- fileHandle,
- sizeof(struct win_dev),
- 0, &bytes_written, (char *)pdev,
- NULL, 0);
- } /* else if(S_ISFIFO */
- CIFSSMBClose(xid, pTcon, fileHandle);
- d_drop(direntry);
- }
- kfree(buf);
- /* add code here to set EAs */
- }
+
+ cFYI(1, "sfu compat create special file");
+
+ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (buf == NULL) {
+ kfree(full_path);
+ rc = -ENOMEM;
+ FreeXid(xid);
+ return rc;
}
+ /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+ rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
+ GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+ &fileHandle, &oplock, buf, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc)
+ goto mknod_out;
+
+ /* BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely */
+
+ pdev = (struct win_dev *)buf;
+ if (S_ISCHR(mode)) {
+ memcpy(pdev->type, "IntxCHR", 8);
+ pdev->major =
+ cpu_to_le64(MAJOR(device_number));
+ pdev->minor =
+ cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, pTcon,
+ fileHandle,
+ sizeof(struct win_dev),
+ 0, &bytes_written, (char *)pdev,
+ NULL, 0);
+ } else if (S_ISBLK(mode)) {
+ memcpy(pdev->type, "IntxBLK", 8);
+ pdev->major =
+ cpu_to_le64(MAJOR(device_number));
+ pdev->minor =
+ cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, pTcon,
+ fileHandle,
+ sizeof(struct win_dev),
+ 0, &bytes_written, (char *)pdev,
+ NULL, 0);
+ } /* else if (S_ISFIFO) */
+ CIFSSMBClose(xid, pTcon, fileHandle);
+ d_drop(direntry);
+
+ /* FIXME: add code here to set EAs */
+
+mknod_out:
kfree(full_path);
+ kfree(buf);
FreeXid(xid);
return rc;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e9..de748c652d1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
full_path = build_path_from_dentry(file->f_path.dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- FreeXid(xid);
- return rc;
+ goto out;
}
cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f2..86a164f08a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
xid, NULL);
if (!inode)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(rc);
#ifdef CONFIG_CIFS_FSCACHE
/* populate tcon->resource_id */
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e7531..1db0f0746a5 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,19 @@
#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
#define NTLMSSP_NEGOTIATE_56 0x80000000
+/* Define AV Pair Field IDs */
+#define NTLMSSP_AV_EOL 0
+#define NTLMSSP_AV_NB_COMPUTER_NAME 1
+#define NTLMSSP_AV_NB_DOMAIN_NAME 2
+#define NTLMSSP_AV_DNS_COMPUTER_NAME 3
+#define NTLMSSP_AV_DNS_DOMAIN_NAME 4
+#define NTLMSSP_AV_DNS_TREE_NAME 5
+#define NTLMSSP_AV_FLAGS 6
+#define NTLMSSP_AV_TIMESTAMP 7
+#define NTLMSSP_AV_RESTRICTION 8
+#define NTLMSSP_AV_TARGET_NAME 9
+#define NTLMSSP_AV_CHANNEL_BINDINGS 10
+
/* Although typedefs are not commonly used for structure definitions */
/* in the Linux kernel, in this particular case they are useful */
/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5d..795095f4eac 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
struct cifsSesInfo *ses)
{
+ unsigned int tioffset; /* challeng message target info area */
+ unsigned int tilen; /* challeng message target info area length */
+
CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -405,6 +408,20 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
/* BB spec says that if AvId field of MsvAvTimestamp is populated then
we must set the MIC field of the AUTHENTICATE_MESSAGE */
+ ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
+
+ tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
+ tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+ ses->server->tilen = tilen;
+ if (tilen) {
+ ses->server->tiblob = kmalloc(tilen, GFP_KERNEL);
+ if (!ses->server->tiblob) {
+ cERROR(1, "Challenge target info allocation failure");
+ return -ENOMEM;
+ }
+ memcpy(ses->server->tiblob, bcc_ptr + tioffset, tilen);
+ }
+
return 0;
}
@@ -425,12 +442,13 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
/* BB is NTLMV2 session security format easier to use here? */
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
- NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+ NTLMSSP_NEGOTIATE_NTLM;
if (ses->server->secMode &
- (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
- flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
- flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+ (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+ flags |= NTLMSSP_NEGOTIATE_SIGN |
+ NTLMSSP_NEGOTIATE_KEY_XCH |
+ NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+ }
sec_blob->NegotiateFlags |= cpu_to_le32(flags);
@@ -451,10 +469,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
struct cifsSesInfo *ses,
const struct nls_table *nls_cp, bool first)
{
+ int rc;
+ unsigned int size;
AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
__u32 flags;
unsigned char *tmp;
- char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+ struct ntlmv2_resp ntlmv2_response = {};
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
@@ -477,19 +497,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->LmChallengeResponse.Length = 0;
sec_blob->LmChallengeResponse.MaximumLength = 0;
- /* calculate session key, BB what about adding similar ntlmv2 path? */
- SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
- if (first)
- cifs_calculate_mac_key(&ses->server->mac_signing_key,
- ntlm_session_key, ses->password);
-
- memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
- sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
- sec_blob->NtChallengeResponse.MaximumLength =
- cpu_to_le16(CIFS_SESS_KEY_SIZE);
+ rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
+ if (rc) {
+ cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc);
+ goto setup_ntlmv2_ret;
+ }
+ size = sizeof(struct ntlmv2_resp);
+ memcpy(tmp, (char *)&ntlmv2_response, size);
+ tmp += size;
+ if (ses->server->tilen > 0) {
+ memcpy(tmp, ses->server->tiblob, ses->server->tilen);
+ tmp += ses->server->tilen;
+ } else
+ ses->server->tilen = 0;
- tmp += CIFS_SESS_KEY_SIZE;
+ sec_blob->NtChallengeResponse.Length = cpu_to_le16(size +
+ ses->server->tilen);
+ sec_blob->NtChallengeResponse.MaximumLength =
+ cpu_to_le16(size + ses->server->tilen);
if (ses->domainName == NULL) {
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +527,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
- len += 2; /* trailing null */
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +543,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
- len += 2; /* trailing null */
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,9 +554,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
sec_blob->WorkstationName.MaximumLength = 0;
tmp += 2;
- sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
- sec_blob->SessionKey.Length = 0;
- sec_blob->SessionKey.MaximumLength = 0;
+ if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+ !calc_seckey(ses->server)) {
+ memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+ sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+ sec_blob->SessionKey.MaximumLength =
+ cpu_to_le16(CIFS_CPHTXT_SIZE);
+ tmp += CIFS_CPHTXT_SIZE;
+ } else {
+ sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+ sec_blob->SessionKey.Length = 0;
+ sec_blob->SessionKey.MaximumLength = 0;
+ }
+
+ ses->server->sequence_number = 0;
+
+setup_ntlmv2_ret:
+ if (ses->server->tilen > 0)
+ kfree(ses->server->tiblob);
+
return tmp - pbuffer;
}
@@ -546,15 +587,14 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
return;
}
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
+static int setup_ntlmssp_auth_req(char *ntlmsspblob,
struct cifsSesInfo *ses,
const struct nls_table *nls, bool first_time)
{
int bloblen;
- bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
+ bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls,
first_time);
- pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
return bloblen;
}
@@ -690,7 +730,7 @@ ssetup_ntlmssp_authenticate:
if (first_time) /* should this be moved into common code
with similar ntlmv2 path? */
- cifs_calculate_mac_key(&ses->server->mac_signing_key,
+ cifs_calculate_session_key(&ses->server->session_key,
ntlm_session_key, ses->password);
/* copy session key */
@@ -729,12 +769,21 @@ ssetup_ntlmssp_authenticate:
cpu_to_le16(sizeof(struct ntlmv2_resp));
/* calculate session key */
- setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+ rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+ if (rc) {
+ kfree(v2_sess_key);
+ goto ssetup_exit;
+ }
/* FIXME: calculate MAC key */
memcpy(bcc_ptr, (char *)v2_sess_key,
sizeof(struct ntlmv2_resp));
bcc_ptr += sizeof(struct ntlmv2_resp);
kfree(v2_sess_key);
+ if (ses->server->tilen > 0) {
+ memcpy(bcc_ptr, ses->server->tiblob,
+ ses->server->tilen);
+ bcc_ptr += ses->server->tilen;
+ }
if (ses->capabilities & CAP_UNICODE) {
if (iov[0].iov_len % 2) {
*bcc_ptr = 0;
@@ -765,15 +814,15 @@ ssetup_ntlmssp_authenticate:
}
/* bail out if key is too long */
if (msg->sesskey_len >
- sizeof(ses->server->mac_signing_key.data.krb5)) {
+ sizeof(ses->server->session_key.data.krb5)) {
cERROR(1, "Kerberos signing key too long (%u bytes)",
msg->sesskey_len);
rc = -EOVERFLOW;
goto ssetup_exit;
}
if (first_time) {
- ses->server->mac_signing_key.len = msg->sesskey_len;
- memcpy(ses->server->mac_signing_key.data.krb5,
+ ses->server->session_key.len = msg->sesskey_len;
+ memcpy(ses->server->session_key.data.krb5,
msg->data, msg->sesskey_len);
}
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -815,12 +864,28 @@ ssetup_ntlmssp_authenticate:
if (phase == NtLmNegotiate) {
setup_ntlmssp_neg_req(pSMB, ses);
iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+ iov[1].iov_base = &pSMB->req.SecurityBlob[0];
} else if (phase == NtLmAuthenticate) {
int blob_len;
- blob_len = setup_ntlmssp_auth_req(pSMB, ses,
- nls_cp,
- first_time);
+ char *ntlmsspblob;
+
+ ntlmsspblob = kmalloc(5 *
+ sizeof(struct _AUTHENTICATE_MESSAGE),
+ GFP_KERNEL);
+ if (!ntlmsspblob) {
+ cERROR(1, "Can't allocate NTLMSSP");
+ rc = -ENOMEM;
+ goto ssetup_exit;
+ }
+
+ blob_len = setup_ntlmssp_auth_req(ntlmsspblob,
+ ses,
+ nls_cp,
+ first_time);
iov[1].iov_len = blob_len;
+ iov[1].iov_base = ntlmsspblob;
+ pSMB->req.SecurityBlobLength =
+ cpu_to_le16(blob_len);
/* Make sure that we tell the server that we
are using the uid that it just gave us back
on the response (challenge) */
@@ -830,7 +895,6 @@ ssetup_ntlmssp_authenticate:
rc = -ENOSYS;
goto ssetup_exit;
}
- iov[1].iov_base = &pSMB->req.SecurityBlob[0];
/* unicode strings must be word aligned */
if ((iov[0].iov_len + iov[1].iov_len) % 2) {
*bcc_ptr = 0;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d697..e0588cdf4cc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
(ses->server->secMode & (SECMODE_SIGN_REQUIRED |
SECMODE_SIGN_ENABLED))) {
rc = cifs_verify_signature(midQ->resp_buf,
- &ses->server->mac_signing_key,
+ ses->server,
midQ->sequence_number+1);
if (rc) {
cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
(ses->server->secMode & (SECMODE_SIGN_REQUIRED |
SECMODE_SIGN_ENABLED))) {
rc = cifs_verify_signature(out_buf,
- &ses->server->mac_signing_key,
+ ses->server,
midQ->sequence_number+1);
if (rc) {
cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
(ses->server->secMode & (SECMODE_SIGN_REQUIRED |
SECMODE_SIGN_ENABLED))) {
rc = cifs_verify_signature(out_buf,
- &ses->server->mac_signing_key,
+ ses->server,
midQ->sequence_number+1);
if (rc) {
cERROR(1, "Unexpected SMB signature");
diff --git a/fs/compat.c b/fs/compat.c
index e6d5d70cf3c..718c7062aec 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -77,7 +77,8 @@ int compat_printk(const char *fmt, ...)
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
*/
-asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
+asmlinkage long compat_sys_utime(const char __user *filename,
+ struct compat_utimbuf __user *t)
{
struct timespec tv[2];
@@ -91,7 +92,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
{
struct timespec tv[2];
@@ -106,7 +107,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
return do_utimes(dfd, filename, t ? tv : NULL, flags);
}
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
{
struct timespec tv[2];
@@ -125,7 +126,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
{
return compat_sys_futimesat(AT_FDCWD, filename, t);
}
@@ -169,7 +170,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return err;
}
-asmlinkage long compat_sys_newstat(char __user * filename,
+asmlinkage long compat_sys_newstat(const char __user * filename,
struct compat_stat __user *statbuf)
{
struct kstat stat;
@@ -181,7 +182,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
return cp_compat_stat(&stat, statbuf);
}
-asmlinkage long compat_sys_newlstat(char __user * filename,
+asmlinkage long compat_sys_newlstat(const char __user * filename,
struct compat_stat __user *statbuf)
{
struct kstat stat;
@@ -194,7 +195,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
}
#ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd,
+ const char __user *filename,
struct compat_stat __user *statbuf, int flag)
{
struct kstat stat;
@@ -837,9 +839,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
#define NCPFS_NAME "ncpfs"
#define NFS4_NAME "nfs4"
-asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
- char __user * type, unsigned long flags,
- void __user * data)
+asmlinkage long compat_sys_mount(const char __user * dev_name,
+ const char __user * dir_name,
+ const char __user * type, unsigned long flags,
+ const void __user * data)
{
char *kernel_type;
unsigned long data_page;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 70227e0dc01..03e59aa318e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1699,8 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
goto out_fput;
}
- if (!filp->f_op ||
- (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
+ if (!filp->f_op || !filp->f_op->unlocked_ioctl)
goto do_ioctl;
break;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a53b130b366..1e7a33028d3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
} else {
inode = iget_locked(sb, CRAMINO(cramfs_inode));
- if (inode) {
+ if (inode && (inode->i_state & I_NEW)) {
setup_inode(inode, cramfs_inode);
unlock_new_inode(inode);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 166d35d5686..83293be4814 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
* d_lookup - search for a dentry
* @parent: parent dentry
* @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
*
- * Searches the children of the parent dentry for the name in question. If
- * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use dput to free the entry when it has
- * finished using it. %NULL is returned on failure.
- *
- * __d_lookup is dcache_lock free. The hash list is protected using RCU.
- * Memory barriers are used while updating and doing lockless traversal.
- * To avoid races with d_move while rename is happening, d_lock is used.
- *
- * Overflows in memcmp(), while d_move, are avoided by keeping the length
- * and name pointer in one structure pointed by d_qstr.
- *
- * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
- * lookup is going on.
- *
- * The dentry unused LRU is not updated even if lookup finds the required dentry
- * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
- * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
- * acquisition.
- *
- * d_lookup() is protected against the concurrent renames in some unrelated
- * directory using the seqlockt_t rename_lock.
+ * d_lookup searches the children of the parent dentry for the name in
+ * question. If the dentry is found its reference count is incremented and the
+ * dentry is returned. The caller must use dput to free the entry when it has
+ * finished using it. %NULL is returned if the dentry does not exist.
*/
-
struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
{
struct dentry * dentry = NULL;
@@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
}
EXPORT_SYMBOL(d_lookup);
+/*
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, eg. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
unsigned int len = name->len;
@@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
struct hlist_node *node;
struct dentry *dentry;
+ /*
+ * The hash list is protected using RCU.
+ *
+ * Take d_lock when comparing a candidate dentry, to avoid races
+ * with d_move().
+ *
+ * It is possible that concurrent renames can mess up our list
+ * walk here and result in missing our dentry, resulting in the
+ * false-negative result. d_lookup() protects against concurrent
+ * renames using rename_lock seqlock.
+ *
+ * See Documentation/vfs/dcache-locking.txt for more details.
+ */
rcu_read_lock();
hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
/*
* Recheck the dentry after taking the lock - d_move may have
- * changed things. Don't bother checking the hash because we're
- * about to compare the whole name anyway.
+ * changed things. Don't bother checking the hash because
+ * we're about to compare the whole name anyway.
*/
if (dentry->d_parent != parent)
goto next;
@@ -1925,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
bool slash = false;
int error = 0;
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
@@ -1954,7 +1964,7 @@ out:
if (!error && !slash)
error = prepend(buffer, buflen, "/", 1);
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return error;
global_root:
@@ -1976,7 +1986,7 @@ global_root:
* __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
- * @buffer: buffer to return value in
+ * @buf: buffer to return value in
* @buflen: buffer length
*
* Convert a dentry into an ASCII path name.
@@ -2292,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
struct vfsmount *mnt = path1->mnt;
struct dentry *dentry = path1->dentry;
int res;
- spin_lock(&vfsmount_lock);
+
+ br_read_lock(vfsmount_lock);
if (mnt != path2->mnt) {
for (;;) {
if (mnt->mnt_parent == mnt) {
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return 0;
}
if (mnt->mnt_parent == path2->mnt)
@@ -2306,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
dentry = mnt->mnt_mountpoint;
}
res = is_subdir(dentry, path2->dentry);
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return res;
}
EXPORT_SYMBOL(path_is_under);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65..cbadc1bee6e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
static struct list_head key_tfm_list;
struct mutex key_tfm_list_mutex;
-int ecryptfs_init_crypto(void)
+int __init ecryptfs_init_crypto(void)
{
mutex_init(&key_tfm_list_mutex);
INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ encoded_name_no_prefix_size);
(*encoded_name)[(*encoded_name_size)] = '\0';
- (*encoded_name_size)++;
} else {
rc = -EOPNOTSUPP;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e722..3fbc9420338 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
/**
* ecryptfs_new_lower_dentry
- * @ename: The name of the new dentry.
+ * @name: The name of the new dentry.
* @lower_dir_dentry: Parent directory of the new dentry.
* @nd: nameidata from last lookup.
*
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
* ecryptfs_lookup_one_lower
* @ecryptfs_dentry: The eCryptfs dentry that we are looking up
* @lower_dir_dentry: lower parent directory
+ * @name: lower file name
*
* Get the lower dentry from vfs. If lower dentry does not exist yet,
* create it.
*/
static struct dentry *
ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
- struct dentry *lower_dir_dentry)
+ struct dentry *lower_dir_dentry, struct qstr *name)
{
struct nameidata nd;
struct vfsmount *lower_mnt;
- struct qstr *name;
int err;
- name = &ecryptfs_dentry->d_name;
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
ecryptfs_dentry->d_parent));
err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
size_t encrypted_and_encoded_name_size;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
struct dentry *lower_dir_dentry, *lower_dentry;
+ struct qstr lower_name;
int rc = 0;
ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
goto out_d_drop;
}
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-
+ lower_name.name = ecryptfs_dentry->d_name.name;
+ lower_name.len = ecryptfs_dentry->d_name.len;
+ lower_name.hash = ecryptfs_dentry->d_name.hash;
+ if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+ rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+ &lower_name);
+ if (rc < 0)
+ goto out_d_drop;
+ }
lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
- lower_dir_dentry);
+ lower_dir_dentry, &lower_name);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
"filename; rc = [%d]\n", __func__, rc);
goto out_d_drop;
}
+ lower_name.name = encrypted_and_encoded_name;
+ lower_name.len = encrypted_and_encoded_name_size;
+ lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+ if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+ rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+ &lower_name);
+ if (rc < 0)
+ goto out_d_drop;
+ }
lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
- lower_dir_dentry);
+ lower_dir_dentry, &lower_name);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506e..73811cfa2ea 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ rc = -ENOMEM;
goto out;
}
s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ rc = -ENOMEM;
goto out;
}
s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aaf..0851ab6980f 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
return 0;
}
-int ecryptfs_init_kthread(void)
+int __init ecryptfs_init_kthread(void)
{
int rc = 0;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f..ab224809051 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
return rc;
}
-int ecryptfs_init_messaging(void)
+int __init ecryptfs_init_messaging(void)
{
int i;
int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd..00208c3d7e9 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_init_ecryptfs_miscdev(void)
+int __init ecryptfs_init_ecryptfs_miscdev(void)
{
int rc;
diff --git a/fs/exec.c b/fs/exec.c
index 7761837e450..2d945528274 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -361,13 +361,13 @@ err:
/*
* count() counts the number of strings in array ARGV.
*/
-static int count(char __user * __user * argv, int max)
+static int count(const char __user * const __user * argv, int max)
{
int i = 0;
if (argv != NULL) {
for (;;) {
- char __user * p;
+ const char __user * p;
if (get_user(p, argv))
return -EFAULT;
@@ -387,7 +387,7 @@ static int count(char __user * __user * argv, int max)
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
-static int copy_strings(int argc, char __user * __user * argv,
+static int copy_strings(int argc, const char __user *const __user *argv,
struct linux_binprm *bprm)
{
struct page *kmapped_page = NULL;
@@ -396,7 +396,7 @@ static int copy_strings(int argc, char __user * __user * argv,
int ret;
while (argc-- > 0) {
- char __user *str;
+ const char __user *str;
int len;
unsigned long pos;
@@ -470,12 +470,13 @@ out:
/*
* Like copy_strings, but get argv and its values from kernel memory.
*/
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
+int copy_strings_kernel(int argc, const char *const *argv,
+ struct linux_binprm *bprm)
{
int r;
mm_segment_t oldfs = get_fs();
set_fs(KERNEL_DS);
- r = copy_strings(argc, (char __user * __user *)argv, bprm);
+ r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
set_fs(oldfs);
return r;
}
@@ -997,7 +998,7 @@ EXPORT_SYMBOL(flush_old_exec);
void setup_new_exec(struct linux_binprm * bprm)
{
int i, ch;
- char * name;
+ const char *name;
char tcomm[sizeof(current->comm)];
arch_pick_mmap_layout(current->mm);
@@ -1117,7 +1118,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe = tracehook_unsafe_exec(p);
n_fs = 1;
- write_lock(&p->fs->lock);
+ spin_lock(&p->fs->lock);
rcu_read_lock();
for (t = next_thread(p); t != p; t = next_thread(t)) {
if (t->fs == p->fs)
@@ -1134,7 +1135,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
res = 1;
}
}
- write_unlock(&p->fs->lock);
+ spin_unlock(&p->fs->lock);
return res;
}
@@ -1316,9 +1317,9 @@ EXPORT_SYMBOL(search_binary_handler);
/*
* sys_execve() executes a new program.
*/
-int do_execve(char * filename,
- char __user *__user *argv,
- char __user *__user *envp,
+int do_execve(const char * filename,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp,
struct pt_regs * regs)
{
struct linux_binprm *bprm;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba..1736f235638 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
{
int i, err = 0;
- ll_rw_block(SWRITE, nr_bhs, bhs);
+ for (i = 0; i < nr_bhs; i++)
+ write_dirty_buffer(bhs[i], WRITE);
+
for (i = 0; i < nr_bhs; i++) {
wait_on_buffer(bhs[i]);
if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/file_table.c b/fs/file_table.c
index 2fc3b3c0891..a04bdd81c11 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
+#include <linux/lglock.h>
#include <linux/percpu_counter.h>
+#include <linux/percpu.h>
#include <linux/ima.h>
#include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
-/* public. Not pretty! */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DECLARE_LGLOCK(files_lglock);
+DEFINE_LGLOCK(files_lglock);
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;
@@ -230,15 +232,6 @@ static void __fput(struct file *file)
might_sleep();
fsnotify_close(file);
-
- /*
- * fsnotify_create_event may have taken one or more references on this
- * file. If it did so it left one reference for us to drop to make sure
- * its calls to fput could not prematurely destroy the file.
- */
- if (atomic_long_read(&file->f_count))
- return fput(file);
-
/*
* The function eventpoll_release() should be the first called
* in the file cleanup chain.
@@ -258,7 +251,7 @@ static void __fput(struct file *file)
cdev_put(inode->i_cdev);
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- file_kill(file);
+ file_sb_list_del(file);
if (file->f_mode & FMODE_WRITE)
drop_file_write_access(file);
file->f_path.dentry = NULL;
@@ -337,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
return file;
}
-
void put_filp(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
security_file_free(file);
- file_kill(file);
+ file_sb_list_del(file);
file_free(file);
}
}
-void file_move(struct file *file, struct list_head *list)
+static inline int file_list_cpu(struct file *file)
{
- if (!list)
- return;
- file_list_lock();
- list_move(&file->f_u.fu_list, list);
- file_list_unlock();
+#ifdef CONFIG_SMP
+ return file->f_sb_list_cpu;
+#else
+ return smp_processor_id();
+#endif
+}
+
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+ struct list_head *list;
+#ifdef CONFIG_SMP
+ int cpu;
+ cpu = smp_processor_id();
+ file->f_sb_list_cpu = cpu;
+ list = per_cpu_ptr(sb->s_files, cpu);
+#else
+ list = &sb->s_files;
+#endif
+ list_add(&file->f_u.fu_list, list);
}
-void file_kill(struct file *file)
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
+void file_sb_list_add(struct file *file, struct super_block *sb)
+{
+ lg_local_lock(files_lglock);
+ __file_sb_list_add(file, sb);
+ lg_local_unlock(files_lglock);
+}
+
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ * @sb: sb to remove it from
+ *
+ * Use this function to remove a file from its superblock.
+ */
+void file_sb_list_del(struct file *file)
{
if (!list_empty(&file->f_u.fu_list)) {
- file_list_lock();
+ lg_local_lock_cpu(files_lglock, file_list_cpu(file));
list_del_init(&file->f_u.fu_list);
- file_list_unlock();
+ lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
}
}
+#ifdef CONFIG_SMP
+
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file) \
+{ \
+ int i; \
+ for_each_possible_cpu(i) { \
+ struct list_head *list; \
+ list = per_cpu_ptr((__sb)->s_files, i); \
+ list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry \
+ } \
+}
+
+#else
+
+#define do_file_list_for_each_entry(__sb, __file) \
+{ \
+ struct list_head *list; \
+ list = &(sb)->s_files; \
+ list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry \
+}
+
+#endif
+
int fs_may_remount_ro(struct super_block *sb)
{
struct file *file;
-
/* Check that no files are currently opened for writing. */
- file_list_lock();
- list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+ lg_global_lock(files_lglock);
+ do_file_list_for_each_entry(sb, file) {
struct inode *inode = file->f_path.dentry->d_inode;
/* File with pending delete? */
@@ -381,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
/* Writeable file? */
if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
goto too_bad;
- }
- file_list_unlock();
+ } while_file_list_for_each_entry;
+ lg_global_unlock(files_lglock);
return 1; /* Tis' cool bro. */
too_bad:
- file_list_unlock();
+ lg_global_unlock(files_lglock);
return 0;
}
@@ -401,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
struct file *f;
retry:
- file_list_lock();
- list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+ lg_global_lock(files_lglock);
+ do_file_list_for_each_entry(sb, f) {
struct vfsmount *mnt;
if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
continue;
@@ -417,16 +476,13 @@ retry:
continue;
file_release_write(f);
mnt = mntget(f->f_path.mnt);
- file_list_unlock();
- /*
- * This can sleep, so we can't hold
- * the file_list_lock() spinlock.
- */
+ /* This can sleep, so we can't hold the spinlock. */
+ lg_global_unlock(files_lglock);
mnt_drop_write(mnt);
mntput(mnt);
goto retry;
- }
- file_list_unlock();
+ } while_file_list_for_each_entry;
+ lg_global_unlock(files_lglock);
}
void __init files_init(unsigned long mempages)
@@ -446,5 +502,6 @@ void __init files_init(unsigned long mempages)
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
files_defer_init();
+ lg_lock_init(files_lglock);
percpu_counter_init(&nr_files, 0);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2f76c4a081a..7d9d06ba184 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -68,7 +68,7 @@ int nr_pdflush_threads;
*/
int writeback_in_progress(struct backing_dev_info *bdi)
{
- return !list_empty(&bdi->work_list);
+ return test_bit(BDI_writeback_running, &bdi->state);
}
static void bdi_queue_work(struct backing_dev_info *bdi,
@@ -249,10 +249,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
* Queue all expired dirty inodes for io, eldest first.
+ * Before
+ * newly dirtied b_dirty b_io b_more_io
+ * =============> gf edc BA
+ * After
+ * newly dirtied b_dirty b_io b_more_io
+ * =============> g fBAedc
+ * |
+ * +--> dequeue for IO
*/
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- list_splice_init(&wb->b_more_io, wb->b_io.prev);
+ list_splice_init(&wb->b_more_io, &wb->b_io);
move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
@@ -363,62 +371,35 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode_lock);
inode->i_state &= ~I_SYNC;
if (!(inode->i_state & I_FREEING)) {
- if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
- /*
- * More pages get dirtied by a fast dirtier.
- */
- goto select_queue;
- } else if (inode->i_state & I_DIRTY) {
- /*
- * At least XFS will redirty the inode during the
- * writeback (delalloc) and on io completion (isize).
- */
- redirty_tail(inode);
- } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
- * sometimes bales out without doing anything. Redirty
- * the inode; Move it from b_io onto b_more_io/b_dirty.
+ * sometimes bales out without doing anything.
*/
- /*
- * akpm: if the caller was the kupdate function we put
- * this inode at the head of b_dirty so it gets first
- * consideration. Otherwise, move it to the tail, for
- * the reasons described there. I'm not really sure
- * how much sense this makes. Presumably I had a good
- * reasons for doing it this way, and I'd rather not
- * muck with it at present.
- */
- if (wbc->for_kupdate) {
+ inode->i_state |= I_DIRTY_PAGES;
+ if (wbc->nr_to_write <= 0) {
/*
- * For the kupdate function we move the inode
- * to b_more_io so it will get more writeout as
- * soon as the queue becomes uncongested.
+ * slice used up: queue for next turn
*/
- inode->i_state |= I_DIRTY_PAGES;
-select_queue:
- if (wbc->nr_to_write <= 0) {
- /*
- * slice used up: queue for next turn
- */
- requeue_io(inode);
- } else {
- /*
- * somehow blocked: retry later
- */
- redirty_tail(inode);
- }
+ requeue_io(inode);
} else {
/*
- * Otherwise fully redirty the inode so that
- * other inodes on this superblock will get some
- * writeout. Otherwise heavy writing to one
- * file would indefinitely suspend writeout of
- * all the other files.
+ * Writeback blocked by something other than
+ * congestion. Delay the inode for some time to
+ * avoid spinning on the CPU (100% iowait)
+ * retrying writeback of the dirty page/inode
+ * that cannot be performed immediately.
*/
- inode->i_state |= I_DIRTY_PAGES;
redirty_tail(inode);
}
+ } else if (inode->i_state & I_DIRTY) {
+ /*
+ * Filesystems can dirty the inode during writeback
+ * operations, such as delayed allocation during
+ * submission or metadata updates after data IO
+ * completion.
+ */
+ redirty_tail(inode);
} else if (atomic_read(&inode->i_count)) {
/*
* The inode is clean, inuse
@@ -590,7 +571,7 @@ static inline bool over_bground_thresh(void)
{
unsigned long background_thresh, dirty_thresh;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
return (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
@@ -759,6 +740,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
struct wb_writeback_work *work;
long wrote = 0;
+ set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
/*
* Override sync mode, in case we must wait for completion
@@ -785,6 +767,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
+ clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 1ee40eb9a2c..ed45a9cf5f3 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
{
struct path old_root;
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
old_root = fs->root;
fs->root = *path;
path_get(path);
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
if (old_root.dentry)
path_put(&old_root);
}
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
{
struct path old_pwd;
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
old_pwd = fs->pwd;
fs->pwd = *path;
path_get(path);
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
if (old_pwd.dentry)
path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
task_lock(p);
fs = p->fs;
if (fs) {
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
if (fs->root.dentry == old_root->dentry
&& fs->root.mnt == old_root->mnt) {
path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
fs->pwd = *new_root;
count++;
}
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
}
task_unlock(p);
} while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
if (fs) {
int kill;
task_lock(tsk);
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
tsk->fs = NULL;
kill = !--fs->users;
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
task_unlock(tsk);
if (kill)
free_fs_struct(fs);
@@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
- rwlock_init(&fs->lock);
+ spin_lock_init(&fs->lock);
fs->umask = old->umask;
get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
}
@@ -121,10 +121,10 @@ int unshare_fs_struct(void)
return -ENOMEM;
task_lock(current);
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
kill = !--fs->users;
current->fs = new_fs;
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
task_unlock(current);
if (kill)
@@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
- .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
+ .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
.umask = 0022,
};
@@ -156,14 +156,14 @@ void daemonize_fs_struct(void)
task_lock(current);
- write_lock(&init_fs.lock);
+ spin_lock(&init_fs.lock);
init_fs.users++;
- write_unlock(&init_fs.lock);
+ spin_unlock(&init_fs.lock);
- write_lock(&fs->lock);
+ spin_lock(&fs->lock);
current->fs = &init_fs;
kill = !--fs->users;
- write_unlock(&fs->lock);
+ spin_unlock(&fs->lock);
task_unlock(current);
if (kill)
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 6a026441c5a..f6aad48d38a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -321,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
#define dbgprintk(FMT, ...) \
printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#ifdef __KDEBUG
#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -358,9 +352,9 @@ do { \
} while (0)
#else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
/*
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 69ad053ffd7..d367af1514e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
* Called with fc->lock, unlocks it
*/
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
{
void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
req->end = NULL;
@@ -306,8 +306,8 @@ __releases(&fc->lock)
static void wait_answer_interruptible(struct fuse_conn *fc,
struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
if (signal_pending(current))
return;
@@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
if (!fc->no_interrupt) {
/* Any signal may interrupt this */
@@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc)
/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
DECLARE_WAITQUEUE(wait, current);
@@ -934,7 +934,7 @@ __acquires(&fc->lock)
*/
static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
size_t nbytes, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
{
struct fuse_in_header ih;
struct fuse_interrupt_in arg;
@@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
* This function releases and reacquires fc->lock
*/
static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -1744,8 +1744,8 @@ __acquires(&fc->lock)
* locked).
*/
static void end_io_requests(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
while (!list_empty(&fc->io)) {
struct fuse_req *req =
@@ -1769,6 +1769,16 @@ __acquires(&fc->lock)
}
}
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ fc->max_background = UINT_MAX;
+ flush_bg_queue(fc);
+ end_requests(fc, &fc->pending);
+ end_requests(fc, &fc->processing);
+}
+
/*
* Abort all requests.
*
@@ -1795,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
fc->connected = 0;
fc->blocked = 0;
end_io_requests(fc);
- end_requests(fc, &fc->pending);
- end_requests(fc, &fc->processing);
+ end_queued_requests(fc);
wake_up_all(&fc->waitq);
wake_up_all(&fc->blocked_waitq);
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1811,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
if (fc) {
spin_lock(&fc->lock);
fc->connected = 0;
- end_requests(fc, &fc->pending);
- end_requests(fc, &fc->processing);
+ fc->blocked = 0;
+ end_queued_requests(fc);
+ wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
fuse_conn_put(fc);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 147c1f71bdb..c8224587123 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
/* Called under fc->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
loff_t size = i_size_read(req->inode);
@@ -1183,8 +1183,8 @@ __acquires(&fc->lock)
* Called with fc->lock
*/
void fuse_flush_writepages(struct inode *inode)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e56415..6bc9e3a5a69 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
if (error < 0)
goto failed;
inode->i_mode = mode;
+ inode->i_ctime = CURRENT_TIME;
if (error == 0) {
posix_acl_release(acl);
acl = NULL;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dd1e55535a4..f7dc9b5f9ef 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
__putname(name);
return NULL;
}
- strncpy(name, root, PATH_MAX);
+ strlcpy(name, root, PATH_MAX);
if (len > p - name) {
__putname(name);
return NULL;
@@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
char *path = dentry_name(dentry);
int err = -ENOMEM;
if (path) {
- int err = hostfs_do_readlink(path, link, PATH_MAX);
+ err = hostfs_do_readlink(path, link, PATH_MAX);
if (err == PATH_MAX)
err = -E2BIG;
__putname(path);
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a6..a6910e91cee 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/lglock.h>
+
struct super_block;
struct linux_binprm;
struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
extern void __init mnt_init(void);
-extern spinlock_t vfsmount_lock;
+DECLARE_BRLOCK(vfsmount_lock);
+
/*
* fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
/*
* file_table.c
*/
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
extern void mark_files_ro(struct super_block *);
extern struct file *get_empty_filp(void);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a71386..f855ea4fc88 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
* @arg: command-specific argument for ioctl
*
* Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
- * invokes filesystem specific ->ioctl method. If neither method exists,
* returns -ENOTTY.
*
* Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
{
int error = -ENOTTY;
- if (!filp->f_op)
+ if (!filp->f_op || !filp->f_op->unlocked_ioctl)
goto out;
- if (filp->f_op->unlocked_ioctl) {
- error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
- if (error == -ENOIOCTLCMD)
- error = -EINVAL;
- goto out;
- } else if (filp->f_op->ioctl) {
- lock_kernel();
- error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
- filp, cmd, arg);
- unlock_kernel();
- }
-
+ error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
+ if (error == -ENOIOCTLCMD)
+ error = -EINVAL;
out:
return error;
}
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654..05a38b9c4c0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
- ll_rw_block(SWRITE, *batch_count, bhs);
+ for (i = 0; i < *batch_count; i++)
+ write_dirty_buffer(bhs[i], WRITE);
+
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c4..95d8c11c929 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
struct buffer_head *bh;
journal_header_t *header;
int ret;
- int barrier_done = 0;
if (is_journal_aborted(journal))
return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
JBUFFER_TRACE(descriptor, "write commit block");
set_buffer_dirty(bh);
+
if (journal->j_flags & JFS_BARRIER) {
- set_buffer_ordered(bh);
- barrier_done = 1;
- }
- ret = sync_dirty_buffer(bh);
- if (barrier_done)
- clear_buffer_ordered(bh);
- /* is it possible for another commit to fail at roughly
- * the same time as this one? If so, we don't want to
- * trust the barrier flag in the super, but instead want
- * to remember if we sent a barrier request
- */
- if (ret == -EOPNOTSUPP && barrier_done) {
- char b[BDEVNAME_SIZE];
+ ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
- printk(KERN_WARNING
- "JBD: barrier-based sync failed on %s - "
- "disabling barriers\n",
- bdevname(journal->j_dev, b));
- spin_lock(&journal->j_state_lock);
- journal->j_flags &= ~JFS_BARRIER;
- spin_unlock(&journal->j_state_lock);
+ /*
+ * Is it possible for another commit to fail at roughly
+ * the same time as this one? If so, we don't want to
+ * trust the barrier flag in the super, but instead want
+ * to remember if we sent a barrier request
+ */
+ if (ret == -EOPNOTSUPP) {
+ char b[BDEVNAME_SIZE];
- /* And try again, without the barrier */
- set_buffer_uptodate(bh);
- set_buffer_dirty(bh);
+ printk(KERN_WARNING
+ "JBD: barrier-based sync failed on %s - "
+ "disabling barriers\n",
+ bdevname(journal->j_dev, b));
+ spin_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JFS_BARRIER;
+ spin_unlock(&journal->j_state_lock);
+
+ /* And try again, without the barrier */
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ ret = sync_dirty_buffer(bh);
+ }
+ } else {
ret = sync_dirty_buffer(bh);
}
+
put_bh(bh); /* One for getblk() */
journal_put_journal_head(descriptor);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f19ce94693d..2c4b1f109da 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
if (wait)
sync_dirty_buffer(bh);
else
- ll_rw_block(SWRITE, 1, &bh);
+ write_dirty_buffer(bh, WRITE);
out:
/* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343..d29018307e2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
- ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+ write_dirty_buffer(bh, write_op);
}
#endif
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c23a0f4e8a..5247e7ffdcb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
{
int i;
- ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
+ for (i = 0; i < *batch_count; i++)
+ write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
+
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
clear_buffer_jwrite(bh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f52e5e8049f..7c068c189d8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
- int barrier_done = 0;
struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
- set_buffer_ordered(bh);
- barrier_done = 1;
- }
- ret = submit_bh(WRITE_SYNC_PLUG, bh);
- if (barrier_done)
- clear_buffer_ordered(bh);
-
- /* is it possible for another commit to fail at roughly
- * the same time as this one? If so, we don't want to
- * trust the barrier flag in the super, but instead want
- * to remember if we sent a barrier request
- */
- if (ret == -EOPNOTSUPP && barrier_done) {
- printk(KERN_WARNING
- "JBD2: Disabling barriers on %s, "
- "not supported by device\n", journal->j_devname);
- write_lock(&journal->j_state_lock);
- journal->j_flags &= ~JBD2_BARRIER;
- write_unlock(&journal->j_state_lock);
+ ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
+ if (ret == -EOPNOTSUPP) {
+ printk(KERN_WARNING
+ "JBD2: Disabling barriers on %s, "
+ "not supported by device\n", journal->j_devname);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_BARRIER;
+ write_unlock(&journal->j_state_lock);
- /* And try again, without the barrier */
- lock_buffer(bh);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
+ /* And try again, without the barrier */
+ lock_buffer(bh);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ ret = submit_bh(WRITE_SYNC_PLUG, bh);
+ }
+ } else {
ret = submit_bh(WRITE_SYNC_PLUG, bh);
}
*cbh = bh;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ad5866aaf0f..0e8014ea6b9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
set_buffer_uptodate(bh);
}
} else
- ll_rw_block(SWRITE, 1, &bh);
+ write_dirty_buffer(bh, WRITE);
out:
/* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e..9ad321fd63f 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
- ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+ write_dirty_buffer(bh, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d95cc9d0401..f3479d6e0a8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,6 +82,32 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
*/
/*
+ * Update transiaction's maximum wait time, if debugging is enabled.
+ *
+ * In order for t_max_wait to be reliable, it must be protected by a
+ * lock. But doing so will mean that start_this_handle() can not be
+ * run in parallel on SMP systems, which limits our scalability. So
+ * unless debugging is enabled, we no longer update t_max_wait, which
+ * means that maximum wait time reported by the jbd2_run_stats
+ * tracepoint will always be zero.
+ */
+static inline void update_t_max_wait(transaction_t *transaction)
+{
+#ifdef CONFIG_JBD2_DEBUG
+ unsigned long ts = jiffies;
+
+ if (jbd2_journal_enable_debug &&
+ time_after(transaction->t_start, ts)) {
+ ts = jbd2_time_diff(ts, transaction->t_start);
+ spin_lock(&transaction->t_handle_lock);
+ if (ts > transaction->t_max_wait)
+ transaction->t_max_wait = ts;
+ spin_unlock(&transaction->t_handle_lock);
+ }
+#endif
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -95,7 +121,6 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
int needed;
int nblocks = handle->h_buffer_credits;
transaction_t *new_transaction = NULL;
- unsigned long ts = jiffies;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -241,25 +266,8 @@ repeat:
/* OK, account for the buffers that this operation expects to
* use and add the handle to the running transaction.
- *
- * In order for t_max_wait to be reliable, it must be
- * protected by a lock. But doing so will mean that
- * start_this_handle() can not be run in parallel on SMP
- * systems, which limits our scalability. So we only enable
- * it when debugging is enabled. We may want to use a
- * separate flag, eventually, so we can enable this
- * independently of debugging.
*/
-#ifdef CONFIG_JBD2_DEBUG
- if (jbd2_journal_enable_debug &&
- time_after(transaction->t_start, ts)) {
- ts = jbd2_time_diff(ts, transaction->t_start);
- spin_lock(&transaction->t_handle_lock);
- if (ts > transaction->t_max_wait)
- transaction->t_max_wait = ts;
- spin_unlock(&transaction->t_handle_lock);
- }
-#endif
+ update_t_max_wait(transaction);
handle->h_transaction = transaction;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 675cc49197f..9777eb5b552 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -824,7 +824,7 @@ const struct inode_operations logfs_dir_iops = {
};
const struct file_operations logfs_dir_fops = {
.fsync = logfs_fsync,
- .ioctl = logfs_ioctl,
+ .unlocked_ioctl = logfs_ioctl,
.readdir = logfs_readdir,
.read = generic_read_dir,
};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 4dd0f7c06e3..e86376b87af 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -181,9 +181,9 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
}
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
- unsigned long arg)
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file->f_path.dentry->d_inode;
struct logfs_inode *li = logfs_inode(inode);
unsigned int oldflags, flags;
int err;
@@ -255,7 +255,7 @@ const struct file_operations logfs_reg_fops = {
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
.fsync = logfs_fsync,
- .ioctl = logfs_ioctl,
+ .unlocked_ioctl = logfs_ioctl,
.llseek = generic_file_llseek,
.mmap = generic_file_readonly_mmap,
.open = generic_file_open,
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5e3b7207795..b8786264d24 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -504,8 +504,7 @@ extern const struct inode_operations logfs_reg_iops;
extern const struct file_operations logfs_reg_fops;
extern const struct address_space_operations logfs_reg_aops;
int logfs_readpage(struct file *file, struct page *page);
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
- unsigned long arg);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int logfs_fsync(struct file *file, int datasync);
/* gc.c */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index cf4e6cdfd15..93444747237 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -80,6 +80,7 @@ struct mb_cache {
struct list_head c_cache_list;
const char *c_name;
atomic_t c_entry_count;
+ int c_max_entries;
int c_bucket_bits;
struct kmem_cache *c_entry_cache;
struct list_head *c_block_hash;
@@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits)
if (!cache->c_entry_cache)
goto fail2;
+ /*
+ * Set an upper limit on the number of cache entries so that the hash
+ * chains won't grow too long.
+ */
+ cache->c_max_entries = bucket_count << 4;
+
spin_lock(&mb_cache_spinlock);
list_add(&cache->c_cache_list, &mb_cache_list);
spin_unlock(&mb_cache_spinlock);
@@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache)
kfree(cache);
}
-
/*
* mb_cache_entry_alloc()
*
@@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
struct mb_cache_entry *
mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
{
- struct mb_cache_entry *ce;
-
- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
- if (ce) {
+ struct mb_cache_entry *ce = NULL;
+
+ if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+ spin_lock(&mb_cache_spinlock);
+ if (!list_empty(&mb_cache_lru_list)) {
+ ce = list_entry(mb_cache_lru_list.next,
+ struct mb_cache_entry, e_lru_list);
+ list_del_init(&ce->e_lru_list);
+ __mb_cache_entry_unhash(ce);
+ }
+ spin_unlock(&mb_cache_spinlock);
+ }
+ if (!ce) {
+ ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+ if (!ce)
+ return NULL;
atomic_inc(&cache->c_entry_count);
INIT_LIST_HEAD(&ce->e_lru_list);
INIT_LIST_HEAD(&ce->e_block_list);
ce->e_cache = cache;
- ce->e_used = 1 + MB_CACHE_WRITER;
ce->e_queued = 0;
}
+ ce->e_used = 1 + MB_CACHE_WRITER;
return ce;
}
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76bf2fb..24896e83356 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
{
struct vfsmount *parent;
struct dentry *mountpoint;
- spin_lock(&vfsmount_lock);
+
+ br_read_lock(vfsmount_lock);
parent = path->mnt->mnt_parent;
if (parent == path->mnt) {
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return 0;
}
mntget(parent);
mountpoint = dget(path->mnt->mnt_mountpoint);
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
@@ -686,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
}
/*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+ struct qstr *name, struct nameidata *nd)
+{
+ struct inode *inode = parent->d_inode;
+ struct dentry *dentry;
+ struct dentry *old;
+
+ /* Don't create child dentry for a dead directory. */
+ if (unlikely(IS_DEADDIR(inode)))
+ return ERR_PTR(-ENOENT);
+
+ dentry = d_alloc(parent, name);
+ if (unlikely(!dentry))
+ return ERR_PTR(-ENOMEM);
+
+ old = inode->i_op->lookup(inode, dentry, nd);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
+ }
+ return dentry;
+}
+
+/*
* It's more convoluted than I'd like it to be, but... it's still fairly
* small and for now I'd prefer to have fast path as straight as possible.
* It _is_ time-critical.
@@ -706,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
return err;
}
+ /*
+ * Rename seqlock is not required here because in the off chance
+ * of a false negative due to a concurrent rename, we're going to
+ * do the non-racy lookup, below.
+ */
dentry = __d_lookup(nd->path.dentry, name);
if (!dentry)
goto need_lookup;
+found:
if (dentry->d_op && dentry->d_op->d_revalidate)
goto need_revalidate;
done:
@@ -724,56 +760,28 @@ need_lookup:
mutex_lock(&dir->i_mutex);
/*
* First re-do the cached lookup just in case it was created
- * while we waited for the directory semaphore..
+ * while we waited for the directory semaphore, or the first
+ * lookup failed due to an unrelated rename.
*
- * FIXME! This could use version numbering or similar to
- * avoid unnecessary cache lookups.
- *
- * The "dcache_lock" is purely to protect the RCU list walker
- * from concurrent renames at this point (we mustn't get false
- * negatives from the RCU list walk here, unlike the optimistic
- * fast walk).
- *
- * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
+ * This could use version numbering or similar to avoid unnecessary
+ * cache lookups, but then we'd have to do the first lookup in the
+ * non-racy way. However in the common case here, everything should
+ * be hot in cache, so would it be a big win?
*/
dentry = d_lookup(parent, name);
- if (!dentry) {
- struct dentry *new;
-
- /* Don't create child dentry for a dead directory. */
- dentry = ERR_PTR(-ENOENT);
- if (IS_DEADDIR(dir))
- goto out_unlock;
-
- new = d_alloc(parent, name);
- dentry = ERR_PTR(-ENOMEM);
- if (new) {
- dentry = dir->i_op->lookup(dir, new, nd);
- if (dentry)
- dput(new);
- else
- dentry = new;
- }
-out_unlock:
+ if (likely(!dentry)) {
+ dentry = d_alloc_and_lookup(parent, name, nd);
mutex_unlock(&dir->i_mutex);
if (IS_ERR(dentry))
goto fail;
goto done;
}
-
/*
* Uhhuh! Nasty case: the cache was re-populated while
* we waited on the semaphore. Need to revalidate.
*/
mutex_unlock(&dir->i_mutex);
- if (dentry->d_op && dentry->d_op->d_revalidate) {
- dentry = do_revalidate(dentry, nd);
- if (!dentry)
- dentry = ERR_PTR(-ENOENT);
- }
- if (IS_ERR(dentry))
- goto fail;
- goto done;
+ goto found;
need_revalidate:
dentry = do_revalidate(dentry, nd);
@@ -1130,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
goto out;
}
- dentry = __d_lookup(base, name);
-
- /* lockess __d_lookup may fail due to concurrent d_move()
- * in some unrelated directory, so try with d_lookup
+ /*
+ * Don't bother with __d_lookup: callers are for creat as
+ * well as unlink, so a lot of the time it would cost
+ * a double lookup.
*/
- if (!dentry)
- dentry = d_lookup(base, name);
+ dentry = d_lookup(base, name);
if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
dentry = do_revalidate(dentry, nd);
- if (!dentry) {
- struct dentry *new;
-
- /* Don't create child dentry for a dead directory. */
- dentry = ERR_PTR(-ENOENT);
- if (IS_DEADDIR(inode))
- goto out;
-
- new = d_alloc(base, name);
- dentry = ERR_PTR(-ENOMEM);
- if (!new)
- goto out;
- dentry = inode->i_op->lookup(inode, new, nd);
- if (!dentry)
- dentry = new;
- else
- dput(new);
- }
+ if (!dentry)
+ dentry = d_alloc_and_lookup(base, name, nd);
out:
return dentry;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b..a72eaabfe8f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -38,12 +40,10 @@
#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
static int event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
static int mnt_alloc_id(struct vfsmount *mnt)
{
int res;
retry:
ida_pre_get(&mnt_id_ida, GFP_KERNEL);
- spin_lock(&vfsmount_lock);
+ spin_lock(&mnt_id_lock);
res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
if (!res)
mnt_id_start = mnt->mnt_id + 1;
- spin_unlock(&vfsmount_lock);
+ spin_unlock(&mnt_id_lock);
if (res == -EAGAIN)
goto retry;
@@ -86,11 +99,11 @@ retry:
static void mnt_free_id(struct vfsmount *mnt)
{
int id = mnt->mnt_id;
- spin_lock(&vfsmount_lock);
+ spin_lock(&mnt_id_lock);
ida_remove(&mnt_id_ida, id);
if (mnt_id_start > id)
mnt_id_start = id;
- spin_unlock(&vfsmount_lock);
+ spin_unlock(&mnt_id_lock);
}
/*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
{
int ret = 0;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
*/
smp_wmb();
mnt->mnt_flags &= ~MNT_WRITE_HOLD;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
return ret;
}
static void __mnt_unmake_readonly(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_flags &= ~MNT_READONLY;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
/*
* find the first or last mount at @dentry on vfsmount @mnt depending on
* @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
*/
struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *lookup_mnt(struct path *path)
{
struct vfsmount *child_mnt;
- spin_lock(&vfsmount_lock);
+
+ br_read_lock(vfsmount_lock);
if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
mntget(child_mnt);
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return child_mnt;
}
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+/*
+ * vfsmount lock must be held for write
+ */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
}
}
+/*
+ * vfsmount lock must be held for write
+ */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
}
+/*
+ * vfsmount lock must be held for write
+ */
static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
old_path->dentry->d_mounted--;
}
+/*
+ * vfsmount lock must be held for write
+ */
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *child_mnt)
{
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
dentry->d_mounted++;
}
+/*
+ * vfsmount lock must be held for write
+ */
static void attach_mnt(struct vfsmount *mnt, struct path *path)
{
mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
}
/*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
*/
static void commit_tree(struct vfsmount *mnt)
{
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
void mntput_no_expire(struct vfsmount *mnt)
{
repeat:
- if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
- if (likely(!mnt->mnt_pinned)) {
- spin_unlock(&vfsmount_lock);
- __mntput(mnt);
- return;
- }
- atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
- mnt->mnt_pinned = 0;
- spin_unlock(&vfsmount_lock);
- acct_auto_close_mnt(mnt);
- goto repeat;
+ if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+ return;
+ br_write_lock(vfsmount_lock);
+ if (!atomic_dec_and_test(&mnt->mnt_count)) {
+ br_write_unlock(vfsmount_lock);
+ return;
}
+ if (likely(!mnt->mnt_pinned)) {
+ br_write_unlock(vfsmount_lock);
+ __mntput(mnt);
+ return;
+ }
+ atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+ mnt->mnt_pinned = 0;
+ br_write_unlock(vfsmount_lock);
+ acct_auto_close_mnt(mnt);
+ goto repeat;
}
-
EXPORT_SYMBOL(mntput_no_expire);
void mnt_pin(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_pinned++;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (mnt->mnt_pinned) {
atomic_inc(&mnt->mnt_count);
mnt->mnt_pinned--;
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
struct mnt_namespace *ns = p->ns;
int res = 0;
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
if (p->event != ns->event) {
p->event = ns->event;
res = 1;
}
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return res;
}
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
int minimum_refs = 0;
struct vfsmount *p;
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += atomic_read(&p->mnt_count);
minimum_refs += 2;
}
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
if (actual_refs > minimum_refs)
return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
if (propagate_mount_busy(mnt, 2))
ret = 0;
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
up_read(&namespace_sem);
return ret;
}
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
if (mnt->mnt_parent != mnt) {
struct dentry *dentry;
struct vfsmount *m;
- spin_lock(&vfsmount_lock);
+
+ br_write_lock(vfsmount_lock);
dentry = mnt->mnt_mountpoint;
m = mnt->mnt_parent;
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
m->mnt_ghosts--;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
dput(dentry);
mntput(m);
}
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
}
}
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
}
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
event++;
if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
umount_tree(mnt, 1, &umount_list);
retval = 0;
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
q = clone_mnt(p, p->mnt_root, flag);
if (!q)
goto Enomem;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &path);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
}
return res;
Enomem:
if (res) {
LIST_HEAD(umount_list);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(res, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
release_mounts(&umount_list);
}
return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
LIST_HEAD(umount_list);
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
}
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
if (err)
goto out_cleanup_ids;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
list_del_init(&child->mnt_hash);
commit_tree(child);
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
+
return 0;
out_cleanup_ids:
@@ -1444,13 +1484,30 @@ out_unlock:
}
/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+ int type = flags & ~MS_REC;
+
+ /* Fail if any non-propagation flags are set */
+ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+ return 0;
+ /* Only one propagation flag should be set */
+ if (!is_power_of_2(type))
+ return 0;
+ return type;
+}
+
+/*
* recursively change the type of the mountpoint.
*/
static int do_change_type(struct path *path, int flag)
{
struct vfsmount *m, *mnt = path->mnt;
int recurse = flag & MS_REC;
- int type = flag & ~MS_REC;
+ int type;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -1459,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
+ type = flags_to_propagation_type(flag);
+ if (!type)
+ return -EINVAL;
+
down_write(&namespace_sem);
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
@@ -1466,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
goto out_unlock;
}
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
out_unlock:
up_write(&namespace_sem);
@@ -1513,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
err = graft_tree(mnt, path);
if (err) {
LIST_HEAD(umount_list);
- spin_lock(&vfsmount_lock);
+
+ br_write_lock(vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
release_mounts(&umount_list);
}
@@ -1568,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
path->mnt->mnt_flags = mnt_flags;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
up_write(&sb->s_umount);
if (!err) {
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
touch_mnt_namespace(path->mnt->mnt_ns);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
return err;
}
@@ -1754,7 +1816,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
return;
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -1773,7 +1835,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1, &umounts);
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umounts);
@@ -1830,6 +1892,8 @@ resume:
/*
* process a list of expirable mountpoints with the intent of discarding any
* submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
*/
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
@@ -2048,9 +2112,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
kfree(new_ns);
return ERR_PTR(-ENOMEM);
}
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2308,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out2; /* not attached */
/* make sure we can reach put_old from new_root */
tmp = old.mnt;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (tmp != new.mnt) {
for (;;) {
if (tmp->mnt_parent == tmp)
@@ -2264,7 +2328,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount new_root on / */
attach_mnt(new.mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
chroot_fs_refs(&root, &new);
error = 0;
path_put(&root_parent);
@@ -2279,7 +2343,7 @@ out1:
out0:
return error;
out3:
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
goto out2;
}
@@ -2326,6 +2390,8 @@ void __init mnt_init(void)
for (u = 0; u < HASH_SIZE; u++)
INIT_LIST_HEAD(&mount_hashtable[u]);
+ br_lock_init(vfsmount_lock);
+
err = sysfs_init();
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2410,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
if (!atomic_dec_and_test(&ns->count))
return;
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(ns->root, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
kfree(ns);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index cc1bb33b59b..6c2aad49d73 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,7 +63,6 @@ config NFS_V3_ACL
config NFS_V4
bool "NFS client support for NFS version 4"
depends on NFS_FS
- select RPCSEC_GSS_KRB5
help
This option enables support for version 4 of the NFS protocol
(RFC 3530) in the kernel's NFS client.
@@ -100,3 +99,20 @@ config NFS_FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
+
+config NFS_USE_LEGACY_DNS
+ bool "Use the legacy NFS DNS resolver"
+ depends on NFS_V4
+ help
+ The kernel now provides a method for translating a host name into an
+ IP address. Select Y here if you would rather use your own DNS
+ resolver script.
+
+ If unsure, say N
+
+config NFS_USE_KERNEL_DNS
+ bool
+ depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+ select DNS_RESOLVER
+ select KEYS
+ default y
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae9512..e17b49e2eab 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
#define NFS_CALLBACK_MAXPORTNR (65535U)
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
unsigned long num;
int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
*((unsigned int *)kp->arg) = num;
return 0;
}
-
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
- return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+ .set = param_set_portnr,
+ .get = param_get_uint,
+};
#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 29539ceeb74..e257172d438 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
/* Call generic open code in order to cache credentials */
res = nfs_open(inode, filp);
+ if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+ /* This is a mountpoint, so d_revalidate will never
+ * have been called, so we need to refresh the
+ * inode (for close-open consistency) ourselves.
+ */
+ __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ }
return res;
}
@@ -1103,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
goto no_open_dput;
/* We can't create new files, or truncate existing ones here */
- openflags &= ~(O_CREAT|O_TRUNC);
+ openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
/*
* Note: we're not holding inode->i_mutex and so may be racing with
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d002..dba50a5625d 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
* Resolves DNS hostnames into valid ip addresses
*/
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/dns_resolver.h>
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ ssize_t ret;
+ char *ip_addr = NULL;
+ int ip_len;
+
+ ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+ if (ip_len > 0)
+ ret = rpc_pton(ip_addr, ip_len, sa, salen);
+ else
+ ret = -ESRCH;
+ kfree(ip_addr);
+ return ret;
+}
+
+#else
+
#include <linux/hash.h>
#include <linux/string.h>
#include <linux/kmod.h>
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
nfs_cache_unregister(&nfs_dns_resolve);
}
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf..199bb5543a9 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+#else
extern int nfs_dns_resolver_init(void);
extern void nfs_dns_resolver_destroy(void);
+#endif
+
extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
struct sockaddr *sa, size_t salen);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2d141a74ae8..eb51bd6201d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -323,7 +323,7 @@ nfs_file_fsync(struct file *file, int datasync)
have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
if (have_error)
ret = xchg(&ctx->error, 0);
- if (!ret)
+ if (!ret && status < 0)
ret = status;
return ret;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ffbb98ddec..089da5b5d20 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2036,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
struct rpc_cred *cred;
struct nfs4_state *state;
struct dentry *res;
- fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+ int open_flags = nd->intent.open.flags;
+ fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
if (nd->flags & LOOKUP_CREATE) {
attr.ia_mode = nd->intent.open.create_mode;
@@ -2044,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
if (!IS_POSIXACL(dir))
attr.ia_mode &= ~current_umask();
} else {
+ open_flags &= ~O_EXCL;
attr.ia_valid = 0;
- BUG_ON(nd->intent.open.flags & O_CREAT);
+ BUG_ON(open_flags & O_CREAT);
}
cred = rpc_lookup_cred();
@@ -2054,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
+ state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
put_rpccred(cred);
if (IS_ERR(state)) {
if (PTR_ERR(state) == -ENOENT) {
@@ -2273,8 +2275,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
out:
if (page)
__free_page(page);
- if (locations)
- kfree(locations);
+ kfree(locations);
return status;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ee26316ad1f..ec3966e4706 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -655,6 +655,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (nfss->options & NFS_OPTION_FSCACHE)
seq_printf(m, ",fsc");
+
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ seq_printf(m, ",lookupcache=none");
+ else
+ seq_printf(m, ",lookupcache=pos");
+ }
}
/*
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a..95932f523ae 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,6 @@ config NFSD_V4
depends on NFSD && PROC_FS && EXPERIMENTAL
select NFSD_V3
select FS_POSIX_ACL
- select RPCSEC_GSS_KRB5
help
This option enables support in your system's NFS server for
version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e7357104cf..cf0d2ffb3c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
static int nfs4_access_to_omode(u32 access)
{
- switch (access) {
+ switch (access & NFS4_SHARE_ACCESS_BOTH) {
case NFS4_SHARE_ACCESS_READ:
return O_RDONLY;
case NFS4_SHARE_ACCESS_WRITE:
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
static __be32
nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
{
- u32 op_share_access, new_access;
+ u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+ bool new_access;
__be32 status;
- set_access(&new_access, stp->st_access_bmap);
- new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
-
+ new_access = !test_bit(op_share_access, &stp->st_access_bmap);
if (new_access) {
- status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
+ status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
if (status)
return status;
}
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
return status;
}
/* remember the open */
- op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
__set_bit(op_share_access, &stp->st_access_bmap);
__set_bit(open->op_share_deny, &stp->st_deny_bmap);
@@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
*filpp = find_readable_file(stp->st_file);
else
*filpp = find_writeable_file(stp->st_file);
- BUG_ON(!*filpp); /* assured by check_openmode */
}
}
status = nfs_ok;
@@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_stateowner *open_sop = NULL;
struct nfs4_stateowner *lock_sop = NULL;
struct nfs4_stateid *lock_stp;
- struct file *filp;
+ struct nfs4_file *fp;
+ struct file *filp = NULL;
struct file_lock file_lock;
struct file_lock conflock;
__be32 status = 0;
@@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* lock stateid.
*/
struct nfs4_stateid *open_stp = NULL;
- struct nfs4_file *fp;
status = nfserr_stale_clientid;
if (!nfsd4_has_session(cstate) &&
@@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
lock_sop = lock->lk_replay_owner;
+ fp = lock_stp->st_file;
}
/* lock->lk_replay_owner and lock_stp have been created or found */
@@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (lock->lk_type) {
case NFS4_READ_LT:
case NFS4_READW_LT:
- filp = find_readable_file(lock_stp->st_file);
+ if (find_readable_file(lock_stp->st_file)) {
+ nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+ filp = find_readable_file(lock_stp->st_file);
+ }
file_lock.fl_type = F_RDLCK;
cmd = F_SETLK;
break;
case NFS4_WRITE_LT:
case NFS4_WRITEW_LT:
- filp = find_writeable_file(lock_stp->st_file);
+ if (find_writeable_file(lock_stp->st_file)) {
+ nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+ filp = find_writeable_file(lock_stp->st_file);
+ }
file_lock.fl_type = F_WRLCK;
cmd = F_SETLK;
break;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971d..322518c88e4 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,23 +363,23 @@ struct nfs4_file {
* at all? */
static inline struct file *find_writeable_file(struct nfs4_file *f)
{
- if (f->fi_fds[O_RDWR])
- return f->fi_fds[O_RDWR];
- return f->fi_fds[O_WRONLY];
+ if (f->fi_fds[O_WRONLY])
+ return f->fi_fds[O_WRONLY];
+ return f->fi_fds[O_RDWR];
}
static inline struct file *find_readable_file(struct nfs4_file *f)
{
- if (f->fi_fds[O_RDWR])
- return f->fi_fds[O_RDWR];
- return f->fi_fds[O_RDONLY];
+ if (f->fi_fds[O_RDONLY])
+ return f->fi_fds[O_RDONLY];
+ return f->fi_fds[O_RDWR];
}
static inline struct file *find_any_file(struct nfs4_file *f)
{
if (f->fi_fds[O_RDWR])
return f->fi_fds[O_RDWR];
- else if (f->fi_fds[O_RDWR])
+ else if (f->fi_fds[O_WRONLY])
return f->fi_fds[O_WRONLY];
else
return f->fi_fds[O_RDONLY];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb9..661a6cf8e82 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
__be32
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{
- struct path path = {
- .mnt = fhp->fh_export->ex_path.mnt,
- .dentry = fhp->fh_dentry,
- };
__be32 err;
err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
- if (!err && vfs_statfs(&path, stat))
- err = nfserr_io;
+ if (!err) {
+ struct path path = {
+ .mnt = fhp->fh_export->ex_path.mnt,
+ .dentry = fhp->fh_dentry,
+ };
+ if (vfs_statfs(&path, stat))
+ err = nfserr_io;
+ }
return err;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1fa86b9df73..922263393c7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
{
struct the_nilfs *nilfs = sbi->s_nilfs;
int err;
- int barrier_done = 0;
- if (nilfs_test_opt(sbi, BARRIER)) {
- set_buffer_ordered(nilfs->ns_sbh[0]);
- barrier_done = 1;
- }
retry:
set_buffer_dirty(nilfs->ns_sbh[0]);
- err = sync_dirty_buffer(nilfs->ns_sbh[0]);
- if (err == -EOPNOTSUPP && barrier_done) {
- nilfs_warning(sbi->s_super, __func__,
- "barrier-based sync failed. "
- "disabling barriers\n");
- nilfs_clear_opt(sbi, BARRIER);
- barrier_done = 0;
- clear_buffer_ordered(nilfs->ns_sbh[0]);
- goto retry;
+
+ if (nilfs_test_opt(sbi, BARRIER)) {
+ err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+ WRITE_SYNC | WRITE_BARRIER);
+ if (err == -EOPNOTSUPP) {
+ nilfs_warning(sbi->s_super, __func__,
+ "barrier-based sync failed. "
+ "disabling barriers\n");
+ nilfs_clear_opt(sbi, BARRIER);
+ goto retry;
+ }
+ } else {
+ err = sync_dirty_buffer(nilfs->ns_sbh[0]);
}
+
if (unlikely(err)) {
printk(KERN_ERR
"NILFS: unable to write superblock (err=%d)\n", err);
@@ -400,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
list_add(&sbi->s_list, &nilfs->ns_supers);
up_write(&nilfs->ns_super_sem);
+ err = -ENOMEM;
sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
if (!sbi->s_ifile)
- return -ENOMEM;
+ goto delist;
down_read(&nilfs->ns_segctor_sem);
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -433,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
nilfs_mdt_destroy(sbi->s_ifile);
sbi->s_ifile = NULL;
+ delist:
down_write(&nilfs->ns_super_sem);
list_del_init(&sbi->s_list);
up_write(&nilfs->ns_super_sem);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 37de1f062d8..ba7c10c917f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
nilfs_mdt_destroy(nilfs->ns_cpfile);
nilfs_mdt_destroy(nilfs->ns_sufile);
nilfs_mdt_destroy(nilfs->ns_dat);
+ nilfs_mdt_destroy(nilfs->ns_gc_dat);
failed:
nilfs_clear_recovery_info(&ri);
@@ -608,11 +609,11 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
return -EINVAL;
}
- if (swp) {
+ if (!valid[!swp])
printk(KERN_WARNING "NILFS warning: broken superblock. "
"using spare superblock.\n");
+ if (swp)
nilfs_swap_super_block(nilfs);
- }
nilfs->ns_sbwcount = 0;
nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
@@ -775,6 +776,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
start * sects_per_block,
nblocks * sects_per_block,
GFP_NOFS,
+ BLKDEV_IFL_WAIT |
BLKDEV_IFL_BARRIER);
if (ret < 0)
return ret;
@@ -785,7 +787,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, BLKDEV_IFL_BARRIER);
+ GFP_NOFS,
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
return ret;
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index eb8f73c9c13..85366c78cc3 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,9 +17,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
old->data_type == new->data_type &&
old->tgid == new->tgid) {
switch (old->data_type) {
- case (FSNOTIFY_EVENT_FILE):
- if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
- (old->file->f_path.dentry == new->file->f_path.dentry))
+ case (FSNOTIFY_EVENT_PATH):
+ if ((old->path.mnt == new->path.mnt) &&
+ (old->path.dentry == new->path.dentry))
return true;
case (FSNOTIFY_EVENT_NONE):
return true;
@@ -165,16 +165,13 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
"mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
inode_mark, vfsmnt_mark, event_mask, data, data_type);
- pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
- __func__, group, vfsmnt_mark, inode_mark, event_mask);
-
/* sorry, fanotify only gives a damn about files and dirs */
if (!S_ISREG(to_tell->i_mode) &&
!S_ISDIR(to_tell->i_mode))
return false;
/* if we don't have enough info to send an event to userspace say no */
- if (data_type != FSNOTIFY_EVENT_FILE)
+ if (data_type != FSNOTIFY_EVENT_PATH)
return false;
if (inode_mark && vfsmnt_mark) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 25a3b4dfcf6..5ed8e58d7bf 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -65,7 +65,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
if (client_fd < 0)
return client_fd;
- if (event->data_type != FSNOTIFY_EVENT_FILE) {
+ if (event->data_type != FSNOTIFY_EVENT_PATH) {
WARN_ON(1);
put_unused_fd(client_fd);
return -EINVAL;
@@ -75,8 +75,8 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
* we need a new file handle for the userspace program so it can read even if it was
* originally opened O_WRONLY.
*/
- dentry = dget(event->file->f_path.dentry);
- mnt = mntget(event->file->f_path.mnt);
+ dentry = dget(event->path.dentry);
+ mnt = mntget(event->path.mnt);
/* it's possible this event was an overflow event. in that case dentry and mnt
* are NULL; That's fine, just don't call dentry open */
if (dentry && mnt)
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
re->fd = fd;
mutex_lock(&group->fanotify_data.access_mutex);
+
+ if (group->fanotify_data.bypass_perm) {
+ mutex_unlock(&group->fanotify_data.access_mutex);
+ kmem_cache_free(fanotify_response_event_cache, re);
+ event->response = FAN_ALLOW;
+ return 0;
+ }
+
list_add_tail(&re->list, &group->fanotify_data.access_list);
mutex_unlock(&group->fanotify_data.access_mutex);
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
+ struct fanotify_response_event *re, *lre;
pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ mutex_lock(&group->fanotify_data.access_mutex);
+
+ group->fanotify_data.bypass_perm = true;
+
+ list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+ pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+ re, re->event);
+
+ list_del_init(&re->list);
+ re->event->response = FAN_ALLOW;
+
+ kmem_cache_free(fanotify_response_event_cache, re);
+ }
+ mutex_unlock(&group->fanotify_data.access_mutex);
+
+ wake_up(&group->fanotify_data.access_waitq);
+#endif
/* matches the fanotify_init->fsnotify_alloc_group */
fsnotify_put_group(group);
@@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
__func__, flags, event_f_flags);
if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return -EPERM;
if (flags & ~FAN_ALL_INIT_FLAGS)
return -EINVAL;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4d2a82c1ceb..36802420d69 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,7 +84,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
}
/* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
{
struct dentry *parent;
struct inode *p_inode;
@@ -92,7 +92,7 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
bool should_update_children = false;
if (!dentry)
- dentry = file->f_path.dentry;
+ dentry = path->dentry;
if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
return;
@@ -124,8 +124,8 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
* specifies these are events which came from a child. */
mask |= FS_EVENT_ON_CHILD;
- if (file)
- fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
+ if (path)
+ fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
dentry->d_name.name, 0);
else
fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
const unsigned char *file_name,
struct fsnotify_event **event)
{
- struct fsnotify_group *group = inode_mark->group;
- __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
- __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ struct fsnotify_group *group = NULL;
+ __u32 inode_test_mask = 0;
+ __u32 vfsmount_test_mask = 0;
- pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
- " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
- mnt, inode_mark, mask, data, data_is, cookie, *event);
+ if (unlikely(!inode_mark && !vfsmount_mark)) {
+ BUG();
+ return 0;
+ }
/* clear ignored on inode modification */
if (mask & FS_MODIFY) {
@@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
/* does the inode mark tell us to do something? */
if (inode_mark) {
+ group = inode_mark->group;
+ inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
inode_test_mask &= inode_mark->mask;
inode_test_mask &= ~inode_mark->ignored_mask;
}
/* does the vfsmount_mark tell us to do something? */
if (vfsmount_mark) {
+ vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ group = vfsmount_mark->group;
vfsmount_test_mask &= vfsmount_mark->mask;
vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
if (inode_mark)
vfsmount_test_mask &= ~inode_mark->ignored_mask;
}
+ pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+ " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+ " data=%p data_is=%d cookie=%d event=%p\n",
+ __func__, group, to_tell, mnt, mask, inode_mark,
+ inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+ data_is, cookie, *event);
+
if (!inode_test_mask && !vfsmount_test_mask)
return 0;
@@ -207,18 +219,17 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
- struct hlist_node *inode_node, *vfsmount_node;
+ struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
struct fsnotify_event *event = NULL;
struct vfsmount *mnt;
int idx, ret = 0;
- bool used_inode = false, used_vfsmount = false;
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
- if (data_is == FSNOTIFY_EVENT_FILE)
- mnt = ((struct file *)data)->f_path.mnt;
+ if (data_is == FSNOTIFY_EVENT_PATH)
+ mnt = ((struct path *)data)->mnt;
else
mnt = NULL;
@@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
(test_mask & to_tell->i_fsnotify_mask))
inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
&fsnotify_mark_srcu);
- else
- inode_node = NULL;
- if (mnt) {
- if ((mask & FS_MODIFY) ||
- (test_mask & mnt->mnt_fsnotify_mask))
- vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
- &fsnotify_mark_srcu);
- else
- vfsmount_node = NULL;
- } else {
- mnt = NULL;
- vfsmount_node = NULL;
+ if (mnt && ((mask & FS_MODIFY) ||
+ (test_mask & mnt->mnt_fsnotify_mask))) {
+ vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+ &fsnotify_mark_srcu);
+ inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ &fsnotify_mark_srcu);
}
while (inode_node || vfsmount_node) {
+ inode_group = vfsmount_group = NULL;
+
if (inode_node) {
inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
struct fsnotify_mark, i.i_list);
inode_group = inode_mark->group;
- } else
- inode_group = (void *)-1;
+ }
if (vfsmount_node) {
vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
struct fsnotify_mark, m.m_list);
vfsmount_group = vfsmount_mark->group;
- } else
- vfsmount_group = (void *)-1;
+ }
- if (inode_group < vfsmount_group) {
+ if (inode_group > vfsmount_group) {
/* handle inode */
send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
data_is, cookie, file_name, &event);
- used_inode = true;
- } else if (vfsmount_group < inode_group) {
+ /* we didn't use the vfsmount_mark */
+ vfsmount_group = NULL;
+ } else if (vfsmount_group > inode_group) {
send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
data_is, cookie, file_name, &event);
- used_vfsmount = true;
+ inode_group = NULL;
} else {
send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
mask, data, data_is, cookie, file_name,
&event);
- used_vfsmount = true;
- used_inode = true;
}
- if (used_inode)
+ if (inode_group)
inode_node = srcu_dereference(inode_node->next,
&fsnotify_mark_srcu);
- if (used_vfsmount)
+ if (vfsmount_group)
vfsmount_node = srcu_dereference(vfsmount_node->next,
&fsnotify_mark_srcu);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5e73eeb2c69..a91b69a6a29 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -52,9 +52,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
!strcmp(old->file_name, new->file_name))
return true;
break;
- case (FSNOTIFY_EVENT_FILE):
- if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
- (old->file->f_path.dentry == new->file->f_path.dentry))
+ case (FSNOTIFY_EVENT_PATH):
+ if ((old->path.mnt == new->path.mnt) &&
+ (old->path.dentry == new->path.dentry))
return true;
break;
case (FSNOTIFY_EVENT_NONE):
@@ -147,10 +147,10 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
__u32 mask, void *data, int data_type)
{
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
- (data_type == FSNOTIFY_EVENT_FILE)) {
- struct file *file = data;
+ (data_type == FSNOTIFY_EVENT_PATH)) {
+ struct path *path = data;
- if (d_unlinked(file->f_path.dentry))
+ if (d_unlinked(path->dentry))
return false;
}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index d6c435adc7a..f39260f8f86 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,7 +31,6 @@
* allocated and used.
*/
-#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -90,8 +89,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
if (atomic_dec_and_test(&event->refcnt)) {
pr_debug("%s: event=%p\n", __func__, event);
- if (event->data_type == FSNOTIFY_EVENT_FILE)
- fput(event->file);
+ if (event->data_type == FSNOTIFY_EVENT_PATH)
+ path_put(&event->path);
BUG_ON(!list_empty(&event->private_data_list));
@@ -376,8 +375,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
}
}
event->tgid = get_pid(old_event->tgid);
- if (event->data_type == FSNOTIFY_EVENT_FILE)
- get_file(event->file);
+ if (event->data_type == FSNOTIFY_EVENT_PATH)
+ path_get(&event->path);
return event;
}
@@ -424,22 +423,11 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
event->data_type = data_type;
switch (data_type) {
- case FSNOTIFY_EVENT_FILE: {
- event->file = data;
- /*
- * if this file is about to disappear hold an extra reference
- * until we return to __fput so we don't have to worry about
- * future get/put destroying the file under us or generating
- * additional events. Notice that we change f_mode without
- * holding f_lock. This is safe since this is the only possible
- * reference to this object in the kernel (it was about to be
- * freed, remember?)
- */
- if (!atomic_long_read(&event->file->f_count)) {
- event->file->f_mode |= FMODE_NONOTIFY;
- get_file(event->file);
- }
- get_file(event->file);
+ case FSNOTIFY_EVENT_PATH: {
+ struct path *path = data;
+ event->path.dentry = path->dentry;
+ event->path.mnt = path->mnt;
+ path_get(&event->path);
break;
}
case FSNOTIFY_EVENT_INODE:
@@ -447,7 +435,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
break;
case FSNOTIFY_EVENT_NONE:
event->inode = NULL;
- event->file = NULL;
+ event->path.dentry = NULL;
+ event->path.mnt = NULL;
break;
default:
BUG();
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e..a76e0aa5cd3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle,
int ocfs2_check_acl(struct inode *inode, int mask)
{
- struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ struct posix_acl *acl;
+ int ret = -EAGAIN;
- if (IS_ERR(acl))
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return ret;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+
+ brelse(di_bh);
+
+ if (IS_ERR(acl)) {
+ mlog_errno(PTR_ERR(acl));
return PTR_ERR(acl);
+ }
if (acl) {
- int ret = posix_acl_permission(inode, acl, mask);
+ ret = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
return ret;
}
@@ -344,7 +362,7 @@ int ocfs2_init_acl(handle_t *handle,
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl = NULL;
- int ret = 0;
+ int ret = 0, ret2;
mode_t mode;
if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +399,12 @@ int ocfs2_init_acl(handle_t *handle,
mode = inode->i_mode;
ret = posix_acl_create_masq(clone, &mode);
if (ret >= 0) {
- ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
if (ret > 0) {
ret = ocfs2_set_acl(handle, inode,
di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d8..592fae5007d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
last_page_bytes = PAGE_ALIGN(end);
index = start >> PAGE_CACHE_SHIFT;
do {
- pages[numpages] = grab_cache_page(mapping, index);
+ pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
ret = -ENOMEM;
mlog_errno(ret);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7155c5a919d..5cfeee11815 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
* out in so that future reads from that region will get
* zero's.
*/
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
/*
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d1233959..c7ee03c2222 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
- "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
(unsigned int)check.bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
goto out;
}
- mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
(unsigned int)check.bc_crc32e, (unsigned int)crc);
rc = -EIO;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78d..1361997cf20 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1759,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock)
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
+ struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
@@ -1796,11 +1797,15 @@ static int o2net_accept_one(struct socket *sock)
goto out;
}
- if (o2nm_this_node() > node->nd_num) {
- mlog(ML_NOTICE, "unexpected connect attempted from a lower "
- "numbered node '%s' at " "%pI4:%d with num %u\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port), node->nd_num);
+ if (o2nm_this_node() >= node->nd_num) {
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
+ "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1857,6 +1862,8 @@ out:
sock_release(new_sock);
if (node)
o2nm_node_put(node);
+ if (local_node)
+ o2nm_node_put(local_node);
if (sc)
sc_put(sc);
return ret;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88..ffb4c68dafa 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
atomic_dec(&dlm->res_cur_count);
- dlm_put(dlm);
-
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->migration_pending = 0;
res->inflight_locks = 0;
- /* put in dlm_lockres_release */
- dlm_grab(dlm);
res->dlm = dlm;
kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36..aaaffbcbe91 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
- if (res->state & DLM_LOCK_RES_DROPPING_REF)
- mlog(0, "%s:%.*s: owned by "
- "dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
- dlm->name, res->lockname.len,
- res->lockname.name, dead_node);
-
- /* the wake_up for this will happen when the
- * RECOVERING flag is dropped later */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "Ignore %.*s for "
+ "recovery as it is being freed\n",
+ res->lockname.len,
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
- dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe..2211acf33d9 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
- if (!__dlm_lockres_has_locks(res) &&
- (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
- /* try not to scan the bitmap unless the first two
- * conditions are already true */
- int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
- if (bit >= O2NM_MAX_NODES) {
- /* since the bit for dlm->node_num is not
- * set, inflight_locks better be zero */
- BUG_ON(res->inflight_locks != 0);
- return 1;
- }
- }
- return 0;
+ int bit;
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ /*
+ * since the bit for dlm->node_num is not set, inflight_locks better
+ * be zero
+ */
+ BUG_ON(res->inflight_locks != 0);
+ return 1;
}
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
- spin_lock(&res->spinlock);
- if (!__dlm_lockres_unused(res)) {
- mlog(0, "%s:%.*s: tried to purge but not unused\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- spin_unlock(&res->spinlock);
- BUG();
- }
-
- if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "%s:%.*s: Delay dropref as this lockres is "
- "being remastered\n", dlm->name, res->lockname.len,
- res->lockname.name);
- /* Re-add the lockres to the end of the purge list */
- if (!list_empty(&res->purge)) {
- list_del_init(&res->purge);
- list_add_tail(&res->purge, &dlm->purge_list);
- }
- spin_unlock(&res->spinlock);
- return 0;
- }
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
- if (!master)
- res->state |= DLM_LOCK_RES_DROPPING_REF;
- spin_unlock(&res->spinlock);
mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
res->lockname.name, master);
if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
- spin_lock(&res->spinlock);
if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purgelist, "
"master = %d\n", res->lockname.len, res->lockname.name,
res, master);
list_del_init(&res->purge);
- spin_unlock(&res->spinlock);
dlm_lockres_put(res);
dlm->purge_count--;
- } else
- spin_unlock(&res->spinlock);
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
__dlm_unhash_lockres(res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
- spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- }
- return 0;
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ mlog(0, "lockres %s:%.*s: is in use or "
+ "being remastered, used %d, state %d\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name, !unused, lockres->state);
+ list_move_tail(&dlm->purge_list, &lockres->purge);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- if (dlm_purge_lockres(dlm, lockres))
- BUG();
+ dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4331f57e9fd..9a74542e1a0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -63,12 +64,6 @@
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
-}
-
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
@@ -186,12 +181,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
dentry->d_name.len, dentry->d_name.name);
- err = ocfs2_sync_inode(dentry->d_inode);
- if (err)
- goto bail;
-
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ /*
+ * We still have to flush drive's caches to get data to the
+ * platter
+ */
+ if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+ NULL, BLKDEV_IFL_WAIT);
goto bail;
+ }
journal = osb->journal->j_journal;
err = jbd2_journal_force_commit(journal);
@@ -774,7 +773,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2306,17 +2305,6 @@ relock:
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
ppos, count, ocount);
if (written < 0) {
- /*
- * direct write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- * Don't need i_size_read because we hold i_mutex.
- *
- * XXX(truncate): this looks buggy because ocfs2 did not
- * actually implement ->truncate. Take a look at
- * the new truncate sequence and update this accordingly
- */
- if (*ppos + count > inode->i_size)
- truncate_setsize(inode, inode->i_size);
ret = written;
goto out_dio;
}
@@ -2332,7 +2320,7 @@ out_dio:
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- ((file->f_flags & O_DIRECT) && has_refcount)) {
+ ((file->f_flags & O_DIRECT) && !direct_io)) {
ret = filemap_fdatawrite_range(file->f_mapping, pos,
pos + count - 1);
if (ret < 0)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0492464916b..eece3e05d9d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
- if (!status)
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
status = ocfs2_validate_inode_block(osb->sb, bh);
}
if (status < 0) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db3..0bc477a3aeb 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,27 +46,24 @@ struct ocfs2_inode_info
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
struct list_head ip_io_markers;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
- u16 ip_dyn_features;
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- u32 ip_dir_start_lookup;
-
struct ocfs2_caching_info ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
+ u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132ce..7a486819615 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
#include <linux/ext2_fs.h>
+#define o2info_from_user(a, b) \
+ copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b) \
+ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT. The error will be returned from the ioctl(2) call. It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+ struct ocfs2_info_request __user *req)
+{
+ kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+ (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+ __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
@@ -109,6 +129,328 @@ bail:
return status;
}
+int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_blocksize oib;
+
+ if (o2info_from_user(oib, req))
+ goto bail;
+
+ oib.ib_blocksize = inode->i_sb->s_blocksize;
+ oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oib, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oib, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_clustersize oic;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oic, req))
+ goto bail;
+
+ oic.ic_clustersize = osb->s_clustersize;
+ oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oic, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oic, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_maxslots oim;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oim, req))
+ goto bail;
+
+ oim.im_max_slots = osb->max_slots;
+ oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oim, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oim, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_label oil;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oil, req))
+ goto bail;
+
+ memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+ oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oil, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oil, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_uuid oiu;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oiu, req))
+ goto bail;
+
+ memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+ oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oiu, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oiu, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_fs_features oif;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oif, req))
+ goto bail;
+
+ oif.if_compat_features = osb->s_feature_compat;
+ oif.if_incompat_features = osb->s_feature_incompat;
+ oif.if_ro_compat_features = osb->s_feature_ro_compat;
+ oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oif, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oif, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_journal_size oij;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oij, req))
+ goto bail;
+
+ oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+ oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oij, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oij, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oir, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oir, req);
+
+ return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ status = -EINVAL;
+ if (oir.ir_magic != OCFS2_INFO_MAGIC)
+ goto bail;
+
+ switch (oir.ir_code) {
+ case OCFS2_INFO_BLOCKSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+ status = ocfs2_info_handle_blocksize(inode, req);
+ break;
+ case OCFS2_INFO_CLUSTERSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+ status = ocfs2_info_handle_clustersize(inode, req);
+ break;
+ case OCFS2_INFO_MAXSLOTS:
+ if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+ status = ocfs2_info_handle_maxslots(inode, req);
+ break;
+ case OCFS2_INFO_LABEL:
+ if (oir.ir_size == sizeof(struct ocfs2_info_label))
+ status = ocfs2_info_handle_label(inode, req);
+ break;
+ case OCFS2_INFO_UUID:
+ if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+ status = ocfs2_info_handle_uuid(inode, req);
+ break;
+ case OCFS2_INFO_FS_FEATURES:
+ if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+ status = ocfs2_info_handle_fs_features(inode, req);
+ break;
+ case OCFS2_INFO_JOURNAL_SIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+ status = ocfs2_info_handle_journal_size(inode, req);
+ break;
+ default:
+ status = ocfs2_info_handle_unknown(inode, req);
+ break;
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
+{
+ int status = -EFAULT;
+ u64 __user *bp = NULL;
+
+ if (compat_flag) {
+#ifdef CONFIG_COMPAT
+ /*
+ * pointer bp stores the base address of a pointers array,
+ * which collects all addresses of separate request.
+ */
+ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+ BUG();
+#endif
+ } else
+ bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+ if (o2info_from_user(*req_addr, bp + idx))
+ goto bail;
+
+ status = 0;
+bail:
+ return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
+{
+ int i, status = 0;
+ u64 req_addr;
+ struct ocfs2_info_request __user *reqp;
+
+ if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+ (!info->oi_requests)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ for (i = 0; i < info->oi_count; i++) {
+
+ status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+ if (status)
+ break;
+
+ reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+ if (!reqp) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_info_handle_request(inode, reqp);
+ if (status)
+ break;
+ }
+
+bail:
+ return status;
+}
+
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct reflink_arguments args;
const char *old_path, *new_path;
bool preserve;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
default:
return -ENOTTY;
}
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
bool preserve;
struct reflink_arguments args;
struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff..faa2303dbf0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
unsigned int flushed;
- unsigned long old_id;
struct ocfs2_journal *journal = NULL;
mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- old_id = ocfs2_inc_trans_id(journal);
+ ocfs2_inc_trans_id(journal);
flushed = atomic_read(&journal->j_num_trans);
atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
return status;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
os = &osb->osb_orphan_scan;
+ mlog(0, "Begin orphan scan\n");
+
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
goto out;
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
+ mlog(0, "Orphan scan completed\n");
return;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710..43e56b97f9c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b04d6961c0d..7e32db9c2c9 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -75,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
/*
* Another node might have truncated while we were waiting on
* cluster locks.
+ * We don't check size == 0 before the shift. This is borrowed
+ * from do_generic_file_read.
*/
- last_index = size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index) {
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!size || page->index > last_index)) {
ret = -EINVAL;
goto out;
}
@@ -108,7 +110,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = size & ~PAGE_CACHE_MASK;
+ len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f..a00dda2e4f1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -472,32 +472,23 @@ leave:
return status;
}
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
- struct inode *dir,
- struct inode *inode,
- dev_t dev,
- struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac,
+ u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 suballoc_loc, fe_blkno = 0;
- u16 suballoc_bit;
u16 feat;
*new_fe_bh = NULL;
- status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
- inode_ac, &suballoc_loc,
- &suballoc_bit, &fe_blkno);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
@@ -591,6 +582,34 @@ leave:
return status;
}
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+ struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac)
+{
+ int status = 0;
+ u64 suballoc_loc, fe_blkno = 0;
+ u16 suballoc_bit;
+
+ *new_fe_bh = NULL;
+
+ status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+ inode_ac, &suballoc_loc,
+ &suballoc_bit, &fe_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ parent_fe_bh, handle, inode_ac,
+ fe_blkno, suballoc_loc, suballoc_bit);
+}
+
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
int mode)
@@ -1852,61 +1871,117 @@ bail:
return status;
}
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct inode **ret_orphan_dir,
- u64 blkno,
- char *name,
- struct ocfs2_dir_lookup_result *lookup)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ struct buffer_head **ret_orphan_dir_bh)
{
struct inode *orphan_dir_inode;
struct buffer_head *orphan_dir_bh = NULL;
- int status = 0;
-
- status = ocfs2_blkno_stringify(blkno, name);
- if (status < 0) {
- mlog_errno(status);
- return status;
- }
+ int ret = 0;
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -ENOENT;
- mlog_errno(status);
- return status;
+ ret = -ENOENT;
+ mlog_errno(ret);
+ return ret;
}
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (ret < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+
+ mlog_errno(ret);
+ return ret;
}
- status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
- orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
- if (status < 0) {
- ocfs2_inode_unlock(orphan_dir_inode, 1);
+ *ret_orphan_dir = orphan_dir_inode;
+ *ret_orphan_dir_bh = orphan_dir_bh;
- mlog_errno(status);
- goto leave;
+ return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+ struct buffer_head *orphan_dir_bh,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+
+ ret = ocfs2_blkno_stringify(blkno, name);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+ orphan_dir_bh, name,
+ OCFS2_ORPHAN_NAMELEN, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ * ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+ &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+ blkno, name, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
*ret_orphan_dir = orphan_dir_inode;
-leave:
- if (status) {
+out:
+ brelse(orphan_dir_bh);
+
+ if (ret) {
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
}
- brelse(orphan_dir_bh);
-
- mlog_exit(status);
- return status;
+ mlog_exit(ret);
+ return ret;
}
static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2128,99 @@ leave:
return status;
}
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+ struct buffer_head *dir_bh,
+ char *orphan_name,
+ struct inode **ret_orphan_dir,
+ u64 *ret_di_blkno,
+ struct ocfs2_dir_lookup_result *orphan_insert,
+ struct ocfs2_alloc_context **ret_inode_ac)
+{
+ int ret;
+ u64 di_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct inode *orphan_dir = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* reserve an inode spot */
+ ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+ &di_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+ di_blkno, orphan_name, orphan_insert);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *ret_orphan_dir = orphan_dir;
+ *ret_di_blkno = di_blkno;
+ *ret_inode_ac = inode_ac;
+ /*
+ * orphan_name and orphan_insert are already up to
+ * date via prepare_orphan_dir
+ */
+ } else {
+ /* Unroll reserve_new_inode* */
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ /* Unroll orphan dir locking */
+ mutex_unlock(&orphan_dir->i_mutex);
+ ocfs2_inode_unlock(orphan_dir, 1);
+ iput(orphan_dir);
+ }
+
+ brelse(orphan_dir_bh);
+
+ return 0;
+}
+
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode)
@@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ u64 uninitialized_var(di_blkno), suballoc_loc;
+ u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
@@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
return status;
}
- /*
- * We give the orphan dir the root blkno to fake an orphan name,
- * and allocate enough space for our insertion.
- */
- status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
- osb->root_blkno,
- orphan_name, &orphan_insert);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- /* reserve an inode spot */
- status = ocfs2_reserve_new_inode(osb, &inode_ac);
+ status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+ orphan_name, &orphan_dir,
+ &di_blkno, &orphan_insert, &inode_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
did_quota_inode = 1;
- inode->i_nlink = 0;
- /* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, inode,
- 0, &new_di_bh, parent_di_bh, handle,
- inode_ac);
+ status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+ &suballoc_loc,
+ &suballoc_bit, di_blkno);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+ inode->i_nlink = 0;
+ /* do the real work now. */
+ status = __ocfs2_mknod_locked(dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac, di_blkno, suballoc_loc,
+ suballoc_bit);
if (status < 0) {
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a..65739b3b327 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct ocfs2_dlm_lksb l_lksb;
+ unsigned char l_level;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
+ unsigned char l_requested;
+ unsigned char l_blocking;
unsigned int l_pending_gen;
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
+
wait_queue_head_t l_event;
struct list_head l_debug_list;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a8..9bc53549986 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
};
#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in info_requests */
+ __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 47549f64224..a120cfcf69b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2437,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
/*
- * If the refcount rec already exist, cool. We just need
- * to check whether there is a split. Otherwise we just need
- * to increase the refcount.
- * If we will insert one, increases recs_add.
- *
* We record all the records which will be inserted to the
* same refcount block, so that we can tell exactly whether
* we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of spliting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split int the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
*/
if (rec.r_refcount) {
+ recs_add += 2;
/* Check whether we need a split at the beginning. */
if (cpos == start_cpos &&
cpos != le64_to_cpu(rec.r_cpos))
@@ -2954,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_CACHE_SIZE - 1))
to = map_end & (PAGE_CACHE_SIZE - 1);
- page = grab_cache_page(mapping, page_index);
+ page = find_or_create_page(mapping, page_index, GFP_NOFS);
/*
* In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -3181,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
- page = grab_cache_page(context->inode->i_mapping, page_index);
+ page = find_or_create_page(context->inode->i_mapping,
+ page_index, GFP_NOFS);
BUG_ON(!page);
wait_on_page_writeback(page);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 29cba0eaa92..c8ce46f7d8e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
struct rb_node rf_node;
u64 rf_blkno;
u32 rf_generation;
+ struct kref rf_getcnt;
struct rw_semaphore rf_sem;
struct ocfs2_lock_res rf_lockres;
- struct kref rf_getcnt;
int rf_removed;
/* the following 4 fields are used by caching_info. */
- struct ocfs2_caching_info rf_ci;
spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
struct mutex rf_io_mutex;
struct super_block *rf_sb;
};
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353..8a286f54dca 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
u64 sr_bg_blkno; /* The bg we allocated from. Set
to 0 when a block group is
contiguous. */
+ u64 sr_bg_stable_blkno; /*
+ * Doesn't change, always
+ * set to target block
+ * group descriptor
+ * block.
+ */
u64 sr_blkno; /* The first allocated block */
unsigned int sr_bit_offset; /* The bit in the bg */
unsigned int sr_bits; /* How many bits we claimed */
};
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+ if (res->sr_blkno == 0)
+ return 0;
+
+ if (res->sr_bg_blkno)
+ return res->sr_bg_blkno;
+
+ return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
brelse(ac->ac_bh);
ac->ac_bh = NULL;
ac->ac_resv = NULL;
+ if (ac->ac_find_loc_priv) {
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
+ }
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
if (!ret)
ocfs2_bg_discontig_fix_result(ac, gd, res);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
+
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
res->sr_bits,
le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
if (ret < 0)
mlog_errno(ret);
+out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
out:
@@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
{
int status;
u16 chain;
- u32 tmp_used;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
if (!status)
ocfs2_bg_discontig_fix_result(ac, bg, res);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
/*
* Keep track of previous block descriptor read. When
@@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
}
- /* Ok, claim our bits now: set the info on dinode, chainlist
- * and then the group */
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(alloc_inode),
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
+
+ status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (status) {
mlog_errno(status);
goto bail;
}
- tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
- ocfs2_journal_dirty(handle, ac->ac_bh);
-
status = ocfs2_block_group_set_bits(handle,
alloc_inode,
bg,
@@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
(unsigned long long)le64_to_cpu(fe->i_blkno));
+out_loc_only:
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
brelse(group_bh);
@@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
int status;
u16 victim, i;
u16 bits_left = 0;
+ u64 hint = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
@@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
goto bail;
}
- res->sr_bg_blkno = ac->ac_last_group;
+ res->sr_bg_blkno = hint;
if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
@@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
- if (!status)
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
goto set_hint;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
ac->ac_chain = i;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
- if (!status)
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
break;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1936,7 +1972,7 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = res->sr_bg_blkno;
+ ac->ac_last_group = hint;
}
bail:
@@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_suballoc_result *res;
+
+ BUG_ON(!ac);
+ BUG_ON(ac->ac_bits_given != 0);
+ BUG_ON(ac->ac_bits_wanted != 1);
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+ res = kzalloc(sizeof(*res), GFP_NOFS);
+ if (res == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ /*
+ * The handle started here is for chain relink. Alternatively,
+ * we could just disable relink for these calls.
+ */
+ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This will instruct ocfs2_claim_suballoc_bits and
+ * ocfs2_search_one_group to search but save actual allocation
+ * for later.
+ */
+ ac->ac_find_loc_only = 1;
+
+ ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac->ac_find_loc_priv = res;
+ *fe_blkno = res->sr_blkno;
+
+out:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+ if (ret)
+ kfree(res);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno)
+{
+ int ret;
+ u16 chain;
+ struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+ /*
+ * Since di_blkno is being passed back in, we check for any
+ * inconsistencies which may have happened between
+ * calls. These are code bugs as di_blkno is not expected to
+ * change once returned from ocfs2_find_new_inode_loc()
+ */
+ BUG_ON(res->sr_blkno != di_blkno);
+
+ ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+ res->sr_bg_stable_blkno, &bg_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ chain = le16_to_cpu(bg->bg_chain);
+
+ ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle,
+ ac->ac_inode,
+ bg,
+ bg_bh,
+ res->sr_bit_offset,
+ res->sr_bits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
+ (unsigned long long)di_blkno);
+
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+ BUG_ON(res->sr_bits != 1);
+
+ *suballoc_loc = res->sr_bg_blkno;
+ *suballoc_bit = res->sr_bit_offset;
+ ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+ brelse(bg_bh);
+
+ return ret;
+}
+
int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
@@ -2567,7 +2733,8 @@ out:
* suballoc_bit.
*/
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
- u16 *suballoc_slot, u16 *suballoc_bit)
+ u16 *suballoc_slot, u64 *group_blkno,
+ u16 *suballoc_bit)
{
int status;
struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
if (suballoc_bit)
*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+ if (group_blkno)
+ *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
bail:
brelse(inode_bh);
@@ -2621,7 +2790,8 @@ bail:
*/
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
struct inode *suballoc,
- struct buffer_head *alloc_bh, u64 blkno,
+ struct buffer_head *alloc_bh,
+ u64 group_blkno, u64 blkno,
u16 bit, int *res)
{
struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
goto bail;
}
- if (alloc_di->i_suballoc_loc)
- bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
- else
- bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+ bg_blkno = group_blkno ? group_blkno :
+ ocfs2_which_suballoc_group(blkno, bit);
status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
&group_bh);
if (status < 0) {
@@ -2680,6 +2848,7 @@ bail:
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
int status;
+ u64 group_blkno = 0;
u16 suballoc_bit = 0, suballoc_slot = 0;
struct inode *inode_alloc_inode;
struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
mlog_entry("blkno: %llu", (unsigned long long)blkno);
status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
- &suballoc_bit);
+ &group_blkno, &suballoc_bit);
if (status < 0) {
mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
goto bail;
@@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
}
status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
- blkno, suballoc_bit, res);
+ group_blkno, blkno, suballoc_bit, res);
if (status < 0)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d..b8afabfeede 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+ int ac_find_loc_only; /* hack for reflink operation ordering */
+ struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
struct ocfs2_alloc_reservation *ac_resv;
};
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno);
+
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/open.c b/fs/open.c
index 630715f9f73..d74e1983e8d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop);
- file_move(f, &inode->i_sb->s_files);
+ file_sb_list_add(f, inode->i_sb);
error = security_dentry_open(f, cred);
if (error)
@@ -721,7 +721,7 @@ cleanup_all:
mnt_drop_write(mnt);
}
}
- file_kill(f);
+ file_sb_list_del(f);
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
cleanup_file:
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index d1b8a5c4bc0..d513a07f44b 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -182,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state)
offset = (info->label_block + 1);
} else {
/* unlabeled disk */
- strlcat(tmp, sizeof(tmp), "(nonl)", PAGE_SIZE);
+ strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9;
offset = (info->label_block + 1);
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a8314..8066b8dd748 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
return 0;
}
+/*
+ * vfsmount lock must be held for write
+ */
void change_mnt_propagation(struct vfsmount *mnt, int type)
{
if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
prev_src_mnt = child;
}
out:
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
while (!list_empty(&tmp_list)) {
child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
umount_tree(child, 0, &umount_list);
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
release_mounts(&umount_list);
return ret;
}
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
* other mounts its parent propagates to.
* Check if any of these mounts that **do not have submounts**
* have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
*/
int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
{
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
* collect all mounts that receive propagation from the mount in @list,
* and return these additional mounts in the same list.
* @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
*/
int propagate_umount(struct list_head *list)
{
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 23561cda724..9c2b5f48487 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
{
struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
long rv = -ENOTTY;
- long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
- int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+ long (*ioctl)(struct file *, unsigned int, unsigned long);
spin_lock(&pde->pde_unload_lock);
if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
return rv;
}
pde->pde_users++;
- unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
- ioctl = pde->proc_fops->ioctl;
+ ioctl = pde->proc_fops->unlocked_ioctl;
spin_unlock(&pde->pde_unload_lock);
- if (unlocked_ioctl) {
- rv = unlocked_ioctl(file, cmd, arg);
- if (rv == -ENOIOCTLCMD)
- rv = -EINVAL;
- } else if (ioctl) {
- WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
- "%pf will be called without the Bkl held\n", ioctl);
- rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
- }
+ if (ioctl)
+ rv = ioctl(file, cmd, arg);
pde_users_dec(pde);
return rv;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea1d3f1ffb..271afc48b9a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
int flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
+ unsigned long start;
dev_t dev = 0;
int len;
@@ -220,8 +221,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
}
+ /* We don't show the stack guard page in /proc/maps */
+ start = vma->vm_start;
+ if (vma->vm_flags & VM_GROWSDOWN)
+ if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+ start += PAGE_SIZE;
+
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
- vma->vm_start,
+ start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae35413dcbe..caa758377d6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode)
dquot_drop(inode);
inode->i_blocks = 0;
reiserfs_write_unlock_once(inode->i_sb, depth);
+ return;
no_delete:
end_writeback(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1ec952b1f03..812e2c05aa2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
/* flush out the real blocks */
for (i = 0; i < get_desc_trans_len(desc); i++) {
set_buffer_dirty(real_blocks[i]);
- ll_rw_block(SWRITE, 1, real_blocks + i);
+ write_dirty_buffer(real_blocks[i], WRITE);
}
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(real_blocks[i]);
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c573..12e90e21390 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
}
EXPORT_SYMBOL(vfs_fstat);
-int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+ int flag)
{
struct path path;
int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
}
EXPORT_SYMBOL(vfs_fstatat);
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat(const char __user *name, struct kstat *stat)
{
return vfs_fstatat(AT_FDCWD, name, stat, 0);
}
EXPORT_SYMBOL(vfs_stat);
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat(const char __user *name, struct kstat *stat)
{
return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
}
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
-SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+ struct __old_kernel_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
return cp_old_stat(&stat, statbuf);
}
-SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+ struct __old_kernel_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
-SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+ struct stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
return cp_new_stat(&stat, statbuf);
}
-SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+ struct stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
}
#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
struct stat __user *, statbuf, int, flag)
{
struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
-SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
{
struct kstat stat;
int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
return error;
}
-SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
{
struct kstat stat;
int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
return error;
}
-SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
struct stat64 __user *, statbuf, int, flag)
{
struct kstat stat;
diff --git a/fs/super.c b/fs/super.c
index 9674ab2c871..8819e3a7ff2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
s = NULL;
goto out;
}
+#ifdef CONFIG_SMP
+ s->s_files = alloc_percpu(struct list_head);
+ if (!s->s_files) {
+ security_sb_free(s);
+ kfree(s);
+ s = NULL;
+ goto out;
+ } else {
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+ }
+#else
INIT_LIST_HEAD(&s->s_files);
+#endif
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
*/
static inline void destroy_super(struct super_block *s)
{
+#ifdef CONFIG_SMP
+ free_percpu(s->s_files);
+#endif
security_sb_free(s);
kfree(s->s_subtype);
kfree(s->s_options);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b27b5688f6..da3fefe91a8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
char *p;
p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
- if (p)
+ if (!IS_ERR(p))
memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d..46f7a807bbc 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
sb->s_dirt = 1;
unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
if (overflow) {
fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
sb->s_dirt = 1;
UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
succed:
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
sb->s_dirt = 1;
result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 428017e018f..2eabf04af3d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
sb->s_dirt = 1;
unlock_super (sb);
@@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer(UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
UFSD("EXIT\n");
}
@@ -290,10 +286,8 @@ cg_found:
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
- if (sb->s_flags & MS_SYNCHRONOUS) {
- ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
- ubh_wait_on_buffer (UCPI_UBH(ucpi));
- }
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ ubh_sync_block(UCPI_UBH(ucpi));
sb->s_dirt = 1;
inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 34d5cb13532..a58f9155fc9 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
ubh_bforget(ind_ubh);
ind_ubh = NULL;
}
- if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
- ubh_ll_rw_block(SWRITE, ind_ubh);
- ubh_wait_on_buffer (ind_ubh);
- }
+ if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
+ ubh_sync_block(ind_ubh);
ubh_brelse (ind_ubh);
UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
ubh_bforget(dind_bh);
dind_bh = NULL;
}
- if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
- ubh_ll_rw_block(SWRITE, dind_bh);
- ubh_wait_on_buffer (dind_bh);
- }
+ if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
+ ubh_sync_block(dind_bh);
ubh_brelse (dind_bh);
UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
ubh_bforget(tind_bh);
tind_bh = NULL;
}
- if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
- ubh_ll_rw_block(SWRITE, tind_bh);
- ubh_wait_on_buffer (tind_bh);
- }
+ if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
+ ubh_sync_block(tind_bh);
ubh_brelse (tind_bh);
UFSD("EXIT: ino %lu\n", inode->i_ino);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4..d2c36d53fe6 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
}
}
-void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
+void ubh_sync_block(struct ufs_buffer_head *ubh)
{
- if (!ubh)
- return;
+ if (ubh) {
+ unsigned i;
- ll_rw_block(rw, ubh->count, ubh->bh);
-}
+ for (i = 0; i < ubh->count; i++)
+ write_dirty_buffer(ubh->bh[i], WRITE);
-void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
-{
- unsigned i;
- if (!ubh)
- return;
- for ( i = 0; i < ubh->count; i++ )
- wait_on_buffer (ubh->bh[i]);
+ for (i = 0; i < ubh->count; i++)
+ wait_on_buffer(ubh->bh[i]);
+ }
}
void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0466036912f..9f8775ce381 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
-extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
+extern void ubh_sync_block(struct ufs_buffer_head *);
extern void ubh_bforget (struct ufs_buffer_head *);
extern int ubh_buffer_dirty (struct ufs_buffer_head *);
#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d37..179b5869065 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
* must be owner or have write permission.
* Else, update from *times, must be owner or super user.
*/
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+long do_utimes(int dfd, const char __user *filename, struct timespec *times,
+ int flags)
{
int error = -EINVAL;
@@ -170,7 +171,7 @@ out:
return error;
}
-SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
struct timespec __user *, utimes, int, flags)
{
struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
}
-SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
struct timeval __user *, utimes)
{
struct timeval times[2];
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3..b552f816de1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
SetPageUptodate(page);
if (count) {
- wbc->nr_to_write--;
- if (wbc->nr_to_write <= 0)
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE)
done = 1;
}
xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
* by themselves.
*/
if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
- goto out_fail;
+ goto redirty;
/*
* We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
*/
xfs_count_page_state(page, &delalloc, &unwritten);
if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
- goto out_fail;
+ goto redirty;
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
if (iohead)
xfs_cancel_ioend(iohead);
+ if (err == -EAGAIN)
+ goto redirty;
+
xfs_aops_discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
return err;
-out_fail:
+redirty:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ea79072f521..d72cf2bb054 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
ASSERT(btp == bp->b_target);
if (bp->b_file_offset == range_base &&
bp->b_buffer_length == range_length) {
- /*
- * If we look at something, bring it to the
- * front of the list for next time.
- */
atomic_inc(&bp->b_hold);
- list_move(&bp->b_hash_list, &hash->bh_list);
goto found;
}
}
@@ -1443,8 +1438,7 @@ xfs_alloc_bufhash(
{
unsigned int i;
- btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
- btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
+ btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
sizeof(xfs_bufhash_t));
for (i = 0; i < (1 << btp->bt_hashshift); i++) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d072e5ff923..2a05614f0b9 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -137,7 +137,6 @@ typedef struct xfs_buftarg {
size_t bt_smask;
/* per device buffer hash table */
- uint bt_hashmask;
uint bt_hashshift;
xfs_bufhash_t *bt_hash;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee..4fec427b83e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -907,6 +907,13 @@ xfs_ioctl_setattr(
return XFS_ERROR(EIO);
/*
+ * Disallow 32bit project ids because on-disk structure
+ * is 16bit only.
+ */
+ if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+ return XFS_ERROR(EINVAL);
+
+ /*
* If disk quotas is on, we make sure that the dquots do exist on disk,
* before we start any other transactions. Trying to do this later
* is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd30..b1fc2a6bfe8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
fieinfo->fi_extents_max + 1;
bm.bmv_count = min_t(__s32, bm.bmv_count,
(PAGE_SIZE * 16 / sizeof(struct getbmapx)));
- bm.bmv_iflags = BMV_IF_PREALLOC;
+ bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
bm.bmv_iflags |= BMV_IF_ATTRFORK;
if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff1..a4e07974955 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1226,6 +1226,7 @@ xfs_fs_statfs(
struct xfs_inode *ip = XFS_I(dentry->d_inode);
__uint64_t fakeinos, id;
xfs_extlen_t lsize;
+ __int64_t ffree;
statp->f_type = XFS_SB_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -1249,7 +1250,11 @@ xfs_fs_statfs(
statp->f_files = min_t(typeof(statp->f_files),
statp->f_files,
mp->m_maxicount);
- statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+
+ /* make sure statp->f_ffree does not underflow */
+ ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+ statp->f_ffree = max_t(__int64_t, ffree, 0);
+
spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1402,7 +1407,7 @@ xfs_fs_freeze(
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
- return -xfs_fs_log_dummy(mp);
+ return -xfs_fs_log_dummy(mp, SYNC_WAIT);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d159..d59c4a65d49 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
+#include "xfs_fsops.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
}
STATIC int
-xfs_commit_dummy_trans(
- struct xfs_mount *mp,
- uint flags)
-{
- struct xfs_inode *ip = mp->m_rootip;
- struct xfs_trans *tp;
- int error;
-
- /*
- * Put a dummy transaction in the log to tell recovery
- * that all others are OK.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
- error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- /* the log force ensures this transaction is pushed to disk */
- xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
- return error;
-}
-
-STATIC int
xfs_sync_fsdata(
struct xfs_mount *mp)
{
@@ -432,7 +401,7 @@ xfs_quiesce_data(
/* mark the log as covered if needed */
if (xfs_log_need_covered(mp))
- error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+ error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
/*
* Every sync period we need to unpin all items, reclaim inodes and sync
* disk quotas. We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
*/
STATIC void
xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
xfs_reclaim_inodes(mp, 0);
/* dgc: errors ignored here */
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
- if (xfs_log_need_covered(mp))
- error = xfs_commit_dummy_trans(mp, 0);
+ if (mp->m_super->s_frozen == SB_UNFROZEN &&
+ xfs_log_need_covered(mp))
+ error = xfs_fs_log_dummy(mp, 0);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c1..f90dadd5a96 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
map[i].br_startblock))
goto out_free_map;
- nexleft--;
bmv->bmv_offset =
out[cur_ext].bmv_offset +
out[cur_ext].bmv_length;
bmv->bmv_length =
max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+ /*
+ * In case we don't want to return the hole,
+ * don't increase cur_ext so that we can reuse
+ * it in the next loop.
+ */
+ if ((iflags & BMV_IF_NO_HOLES) &&
+ map[i].br_startblock == HOLESTARTBLOCK) {
+ memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+ continue;
+ }
+
+ nexleft--;
bmv->bmv_entries++;
cur_ext++;
}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5..87c2e9d0228 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
#define BMV_IF_VALID \
- (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37b..43b1d569933 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
return 0;
}
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
int
xfs_fs_log_dummy(
- xfs_mount_t *mp)
+ xfs_mount_t *mp,
+ int flags)
{
xfs_trans_t *tp;
- xfs_inode_t *ip;
int error;
tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
- error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
- ip = mp->m_rootip;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
+ /* log the UUID because it is an unchanging field */
+ xfs_mod_sb(tp, XFS_SB_UUID);
+ if (flags & SYNC_WAIT)
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp, 0);
}
int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c..a786c5212c1 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95..5371d2dc360 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
struct xfs_inobt_rec_incore rec;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
- xfs_agino_t startino;
int error;
int i;
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
}
/*
- * derive and lookup the exact inode record for the given agino. If the
- * record cannot be found, then it's an invalid inode number and we
- * should abort.
+ * Lookup the inode record for the given agino. If the record cannot be
+ * found, then it's an invalid inode number and we should abort. Once
+ * we have a record, we need to ensure it contains the inode number
+ * we are looking up.
*/
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
- startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
- error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
if (error)
return error;
+ /* check that the returned record contains the required inode */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+ return EINVAL;
+
/* for untrusted inodes check it is allocated first */
if ((flags & XFS_IGET_UNTRUSTED) &&
(rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23..34798f391c4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
return 0;
}
+/*
+ * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
STATIC void
xfs_ifree_cluster(
xfs_inode_t *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
}
for (j = 0; j < nbufs; j++, inum += ninodes) {
- int found = 0;
-
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
/*
* Walk the inodes already attached to the buffer and mark them
* stale. These will all have the flush locks held, so an
- * in-memory inode walk can't lock them.
+ * in-memory inode walk can't lock them. By marking them all
+ * stale first, we will not attempt to lock them in the loop
+ * below as the XFS_ISTALE flag will be set.
*/
lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
&iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- found++;
}
lip = lip->li_bio_list;
}
+
/*
* For each inode in memory attempt to add it to the inode
* buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < ninodes; i++) {
+retry:
read_lock(&pag->pag_ici_lock);
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
continue;
}
- /* don't try to lock/unlock the current inode */
+ /*
+ * Don't try to lock/unlock the current inode, but we
+ * _cannot_ skip the other inodes that we did not find
+ * in the list attached to the buffer and are not
+ * already marked stale. If we can't lock it, back off
+ * and retry.
+ */
if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
read_unlock(&pag->pag_ici_lock);
- continue;
+ delay(1);
+ goto retry;
}
read_unlock(&pag->pag_ici_lock);
- if (!xfs_iflock_nowait(ip)) {
- if (ip != free_ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
+ xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
- if (xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
+ /*
+ * we don't need to attach clean inodes or those only
+ * with unlogged changes (which we throw away, anyway).
+ */
iip = ip->i_itemp;
- if (!iip) {
- /* inode with unlogged changes only */
+ if (!iip || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
ip->i_update_core = 0;
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
- found++;
iip->ili_last_fields = iip->ili_format.ilf_fields;
iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- if (found)
- xfs_trans_stale_inode_buf(tp, bp);
+ xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f..33f718f92a4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3015,7 +3015,8 @@ _xfs_log_force(
XFS_STATS_INC(xs_log_force);
- xlog_cil_push(log, 1);
+ if (log->l_cilp)
+ xlog_cil_force(log);
spin_lock(&log->l_icloglock);
@@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(
XFS_STATS_INC(xs_log_force);
if (log->l_cilp) {
- lsn = xlog_cil_push_lsn(log, lsn);
+ lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
return 0;
}
@@ -3724,7 +3725,7 @@ xfs_log_force_umount(
* call below.
*/
if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
- xlog_cil_push(log, 1);
+ xlog_cil_force(log);
/*
* We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19a..ed575fb4b49 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
+ cil->xc_current_sequence = ctx->sequence;
cil->xc_log = log;
log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
static void
xlog_cil_format_items(
struct log *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn)
+ struct xfs_log_vec *log_vector)
{
struct xfs_log_vec *lv;
- if (start_lsn)
- *start_lsn = log->l_cilp->xc_ctx->sequence;
-
ASSERT(log_vector);
for (lv = log_vector; lv; lv = lv->lv_next) {
void *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
ptr += vec->i_len;
}
ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+ }
+}
+
+static void
+xlog_cil_insert_items(
+ struct log *log,
+ struct xfs_log_vec *log_vector,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *start_lsn)
+{
+ struct xfs_log_vec *lv;
+
+ if (start_lsn)
+ *start_lsn = log->l_cilp->xc_ctx->sequence;
+ ASSERT(log_vector);
+ for (lv = log_vector; lv; lv = lv->lv_next)
xlog_cil_insert(log, ticket, lv->lv_item, lv);
- }
}
static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
}
/*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_log_vec *log_vector,
- xfs_lsn_t *commit_lsn,
- int flags)
-{
- struct log *log = mp->m_log;
- int log_flags = 0;
- int push = 0;
-
- if (flags & XFS_TRANS_RELEASE_LOG_RES)
- log_flags = XFS_LOG_REL_PERM_RESERV;
-
- if (XLOG_FORCED_SHUTDOWN(log)) {
- xlog_cil_free_logvec(log_vector);
- return XFS_ERROR(EIO);
- }
-
- /* lock out background commit */
- down_read(&log->l_cilp->xc_ctx_lock);
- xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-
- /* check we didn't blow the reservation */
- if (tp->t_ticket->t_curr_res < 0)
- xlog_print_tic_res(log->l_mp, tp->t_ticket);
-
- /* attach the transaction to the CIL if it has any busy extents */
- if (!list_empty(&tp->t_busy)) {
- spin_lock(&log->l_cilp->xc_cil_lock);
- list_splice_init(&tp->t_busy,
- &log->l_cilp->xc_ctx->busy_extents);
- spin_unlock(&log->l_cilp->xc_cil_lock);
- }
-
- tp->t_commit_lsn = *commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
- xfs_trans_unreserve_and_mod_sb(tp);
-
- /* check for background commit before unlock */
- if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
- push = 1;
- up_read(&log->l_cilp->xc_ctx_lock);
-
- /*
- * We need to push CIL every so often so we don't cache more than we
- * can fit in the log. The limit really is that a checkpoint can't be
- * more than half the log (the current checkpoint is not allowed to
- * overwrite the previous checkpoint), but commit latency and memory
- * usage limit this to a smaller size in most cases.
- */
- if (push)
- xlog_cil_push(log, 0);
- return 0;
-}
-
-/*
* Mark all items committed and clear busy extents. We free the log vector
* chains in a separate pass so that we unpin the log items as quickly as
* possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
}
/*
- * Push the Committed Item List to the log. If the push_now flag is not set,
- * then it is a background flush and so we can chose to ignore it.
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can chose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allows log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
*/
-int
+STATIC int
xlog_cil_push(
struct log *log,
- int push_now)
+ xfs_lsn_t push_seq)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_vec *lv;
@@ -453,12 +400,14 @@ xlog_cil_push(
if (!cil)
return 0;
+ ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
/* lock out transaction commit, but don't block on background push */
if (!down_write_trylock(&cil->xc_ctx_lock)) {
- if (!push_now)
+ if (!push_seq)
goto out_free_ticket;
down_write(&cil->xc_ctx_lock);
}
@@ -469,7 +418,11 @@ xlog_cil_push(
goto out_skip;
/* check for spurious background flush */
- if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ goto out_skip;
+
+ /* check for a previously pushed seqeunce */
+ if (push_seq < cil->xc_ctx->sequence)
goto out_skip;
/*
@@ -515,6 +468,13 @@ xlog_cil_push(
cil->xc_ctx = new_ctx;
/*
+ * mirror the new sequence into the cil structure so that we can do
+ * unlocked checks against the current sequence in log forces without
+ * risking deferencing a freed context pointer.
+ */
+ cil->xc_current_sequence = new_ctx->sequence;
+
+ /*
* The switch is now done, so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so
@@ -626,6 +586,102 @@ out_abort:
}
/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ struct log *log = mp->m_log;
+ int log_flags = 0;
+ int push = 0;
+
+ if (flags & XFS_TRANS_RELEASE_LOG_RES)
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ xlog_cil_free_logvec(log_vector);
+ return XFS_ERROR(EIO);
+ }
+
+ /*
+ * do all the hard work of formatting items (including memory
+ * allocation) outside the CIL context lock. This prevents stalling CIL
+ * pushes when we are low on memory and a transaction commit spends a
+ * lot of time in memory reclaim.
+ */
+ xlog_cil_format_items(log, log_vector);
+
+ /* lock out background commit */
+ down_read(&log->l_cilp->xc_ctx_lock);
+ xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+ /* check we didn't blow the reservation */
+ if (tp->t_ticket->t_curr_res < 0)
+ xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy)) {
+ spin_lock(&log->l_cilp->xc_cil_lock);
+ list_splice_init(&tp->t_busy,
+ &log->l_cilp->xc_ctx->busy_extents);
+ spin_unlock(&log->l_cilp->xc_cil_lock);
+ }
+
+ tp->t_commit_lsn = *commit_lsn;
+ xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * Once all the items of the transaction have been copied to the CIL,
+ * the items can be unlocked and freed.
+ *
+ * This needs to be done before we drop the CIL context lock because we
+ * have to update state in the log items and unlock them before they go
+ * to disk. If we don't, then the CIL checkpoint can race with us and
+ * we can run checkpoint completion before we've updated and unlocked
+ * the log items. This affects (at least) processing of stale buffers,
+ * inodes and EFIs.
+ */
+ xfs_trans_free_items(tp, *commit_lsn, 0);
+
+ /* check for background commit before unlock */
+ if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+ push = 1;
+
+ up_read(&log->l_cilp->xc_ctx_lock);
+
+ /*
+ * We need to push CIL every so often so we don't cache more than we
+ * can fit in the log. The limit really is that a checkpoint can't be
+ * more than half the log (the current checkpoint is not allowed to
+ * overwrite the previous checkpoint), but commit latency and memory
+ * usage limit this to a smaller size in most cases.
+ */
+ if (push)
+ xlog_cil_push(log, 0);
+ return 0;
+}
+
+/*
* Conditionally push the CIL based on the sequence passed in.
*
* We only need to push if we haven't already pushed the sequence
@@ -639,39 +695,34 @@ out_abort:
* commit lsn is there. It'll be empty, so this is broken for now.
*/
xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
struct log *log,
- xfs_lsn_t push_seq)
+ xfs_lsn_t sequence)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx;
xfs_lsn_t commit_lsn = NULLCOMMITLSN;
-restart:
- down_write(&cil->xc_ctx_lock);
- ASSERT(push_seq <= cil->xc_ctx->sequence);
-
- /* check to see if we need to force out the current context */
- if (push_seq == cil->xc_ctx->sequence) {
- up_write(&cil->xc_ctx_lock);
- xlog_cil_push(log, 1);
- goto restart;
- }
+ ASSERT(sequence <= cil->xc_current_sequence);
+
+ /*
+ * check to see if we need to force out the current context.
+ * xlog_cil_push() handles racing pushes for the same sequence,
+ * so no need to deal with it here.
+ */
+ if (sequence == cil->xc_current_sequence)
+ xlog_cil_push(log, sequence);
/*
* See if we can find a previous sequence still committing.
- * We can drop the flush lock as soon as we have the cil lock
- * because we are now only comparing contexts protected by
- * the cil lock.
- *
* We need to wait for all previous sequence commits to complete
* before allowing the force of push_seq to go ahead. Hence block
* on commits for those as well.
*/
+restart:
spin_lock(&cil->xc_cil_lock);
- up_write(&cil->xc_ctx_lock);
list_for_each_entry(ctx, &cil->xc_committing, committing) {
- if (ctx->sequence > push_seq)
+ if (ctx->sequence > sequence)
continue;
if (!ctx->commit_lsn) {
/*
@@ -681,7 +732,7 @@ restart:
sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
goto restart;
}
- if (ctx->sequence != push_seq)
+ if (ctx->sequence != sequence)
continue;
/* found it! */
commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965..ced52b98b32 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,6 +422,7 @@ struct xfs_cil {
struct rw_semaphore xc_ctx_lock;
struct list_head xc_committing;
sv_t xc_commit_wait;
+ xfs_lsn_t xc_current_sequence;
};
/*
@@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log);
void xlog_cil_init_post_recovery(struct log *log);
void xlog_cil_destroy(struct log *log);
-int xlog_cil_push(struct log *log, int push_now);
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+ xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
/*
* Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c75..1c47edaea0d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
* Unlock all of the items of a transaction and free all the descriptors
* of that transaction.
*/
-STATIC void
+void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
return error;
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-
- /* xfs_trans_free_items() unlocks them first */
- xfs_trans_free_items(tp, *commit_lsn, 0);
xfs_trans_free(tp);
return 0;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7..62da86c90de 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
-
+void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+ int flags);
void xfs_trans_item_committed(struct xfs_log_item *lip,
xfs_lsn_t commit_lsn, int aborted);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917..4c7c7bfb2b2 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
e = allocatesize_fsb;
}
+ /*
+ * The transaction reservation is limited to a 32-bit block
+ * count, hence we need to limit the number of blocks we are
+ * trying to reserve to avoid an overflow. We can't allocate
+ * more than @nimaps extents, and an extent is limited on disk
+ * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ */
+ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
if (unlikely(rt)) {
- resrtextents = qblocks = (uint)(e - s);
+ resrtextents = qblocks = resblks;
resrtextents /= mp->m_sb.sb_rextsize;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
quota_flag = XFS_QMOPT_RES_RTBLKS;
} else {
resrtextents = 0;
- resblks = qblocks = \
- XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+ resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
quota_flag = XFS_QMOPT_RES_REGBLKS;
}