author    Tony Luck <tony.luck@intel.com>    2005-10-31 10:51:57 -0800
committer Tony Luck <tony.luck@intel.com>    2005-10-31 10:51:57 -0800
commit    c7fb577e2a6cb04732541f2dc402bd46747f7558 (patch)
tree      df3b1a1922ed13bfbcc45d08650c38beeb1a7bd1 /fs
parent    9cec58dc138d6fcad9f447a19c8ff69f6540e667 (diff)
parent    581c1b14394aee60aff46ea67d05483261ed6527 (diff)
manual update from upstream:

Applied Al's change 06a544971fad0992fe8b92c5647538d573089dd4 to the new location of swiotlb.c.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/afs/file.c | 8
-rw-r--r--  fs/aio.c | 6
-rw-r--r--  fs/attr.c | 3
-rw-r--r--  fs/binfmt_aout.c | 1
-rw-r--r--  fs/binfmt_elf.c | 5
-rw-r--r--  fs/binfmt_elf_fdpic.c | 7
-rw-r--r--  fs/binfmt_flat.c | 1
-rw-r--r--  fs/binfmt_som.c | 1
-rw-r--r--  fs/bio.c | 4
-rw-r--r--  fs/buffer.c | 25
-rw-r--r--  fs/coda/psdev.c | 4
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/direct-io.c | 4
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/exec.c | 68
-rw-r--r--  fs/ext2/inode.c | 4
-rw-r--r--  fs/ext3/balloc.c | 7
-rw-r--r--  fs/ext3/bitmap.c | 2
-rw-r--r--  fs/ext3/bitmap.h | 8
-rw-r--r--  fs/ext3/ialloc.c | 3
-rw-r--r--  fs/ext3/inode.c | 13
-rw-r--r--  fs/ext3/namei.c | 2
-rw-r--r--  fs/ext3/namei.h | 8
-rw-r--r--  fs/ext3/resize.c | 10
-rw-r--r--  fs/ext3/super.c | 30
-rw-r--r--  fs/ext3/xattr.c | 8
-rw-r--r--  fs/fat/dir.c | 230
-rw-r--r--  fs/file_table.c | 14
-rw-r--r--  fs/filesystems.c | 1
-rw-r--r--  fs/fs-writeback.c | 28
-rw-r--r--  fs/fuse/dev.c | 6
-rw-r--r--  fs/fuse/dir.c | 5
-rw-r--r--  fs/fuse/fuse_i.h | 12
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/super.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 206
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/inotify.c | 1
-rw-r--r--  fs/jbd/journal.c | 2
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jffs2/background.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 20
-rw-r--r--  fs/jfs/jfs_imap.c | 10
-rw-r--r--  fs/jfs/jfs_metapage.c | 22
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/jfs_xtree.c | 18
-rw-r--r--  fs/jfs/super.c | 1
-rw-r--r--  fs/lockd/host.c | 4
-rw-r--r--  fs/locks.c | 48
-rw-r--r--  fs/mbcache.c | 6
-rw-r--r--  fs/msdos/namei.c | 14
-rw-r--r--  fs/namei.c | 101
-rw-r--r--  fs/nfs/delegation.c | 2
-rw-r--r--  fs/nfs/delegation.h | 16
-rw-r--r--  fs/nfs/dir.c | 67
-rw-r--r--  fs/nfs/file.c | 31
-rw-r--r--  fs/nfs/inode.c | 202
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 92
-rw-r--r--  fs/nfs/nfs3xdr.c | 1
-rw-r--r--  fs/nfs/nfs4_fs.h | 53
-rw-r--r--  fs/nfs/nfs4proc.c | 735
-rw-r--r--  fs/nfs/nfs4state.c | 181
-rw-r--r--  fs/nfs/nfs4xdr.c | 305
-rw-r--r--  fs/nfs/proc.c | 44
-rw-r--r--  fs/nfs/read.c | 1
-rw-r--r--  fs/nfs/write.c | 2
-rw-r--r--  fs/ntfs/ChangeLog | 85
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/aops.c | 832
-rw-r--r--  fs/ntfs/attrib.c | 983
-rw-r--r--  fs/ntfs/attrib.h | 10
-rw-r--r--  fs/ntfs/file.c | 2255
-rw-r--r--  fs/ntfs/inode.c | 514
-rw-r--r--  fs/ntfs/layout.h | 31
-rw-r--r--  fs/ntfs/lcnalloc.c | 56
-rw-r--r--  fs/ntfs/lcnalloc.h | 43
-rw-r--r--  fs/ntfs/malloc.h | 3
-rw-r--r--  fs/ntfs/mft.c | 26
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/open.c | 79
-rw-r--r--  fs/partitions/check.c | 29
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  fs/proc/generic.c | 2
-rw-r--r--  fs/proc/inode.c | 17
-rw-r--r--  fs/proc/proc_misc.c | 8
-rw-r--r--  fs/proc/task_mmu.c | 51
-rw-r--r--  fs/reiserfs/fix_node.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/super.c | 25
-rw-r--r--  fs/reiserfs/xattr.c | 2
-rw-r--r--  fs/reiserfs/xattr_acl.c | 3
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/vfat/namei.c | 20
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 22
-rw-r--r--  fs/xfs/linux-2.6/kmem.h | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 15
105 files changed, 5730 insertions(+), 2172 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 48f5422cb19..01a295232f7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -810,7 +810,7 @@ config TMPFS
config HUGETLBFS
bool "HugeTLB file system support"
- depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
+ depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
config HUGETLB_PAGE
def_bool HUGETLBFS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 434c19d076a..175b2e8177c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -57,7 +57,7 @@ config BINFMT_SHARED_FLAT
config BINFMT_AOUT
tristate "Kernel support for a.out and ECOFF binaries"
- depends on (X86 && !X86_64) || ALPHA || ARM || M68K || SPARC32
+ depends on X86_32 || ALPHA || ARM || M68K || SPARC32
---help---
A.out (Assembler.OUTput) is a set of formats for libraries and
executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 23c12512802..4975c9c193d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -29,7 +29,7 @@ static int afs_file_release(struct inode *inode, struct file *file);
static int afs_file_readpage(struct file *file, struct page *page);
static int afs_file_invalidatepage(struct page *page, unsigned long offset);
-static int afs_file_releasepage(struct page *page, int gfp_flags);
+static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
static ssize_t afs_file_write(struct file *file, const char __user *buf,
size_t size, loff_t *off);
@@ -279,7 +279,7 @@ static int afs_file_invalidatepage(struct page *page, unsigned long offset)
/*
* release a page and cleanup its private data
*/
-static int afs_file_releasepage(struct page *page, int gfp_flags)
+static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
{
struct cachefs_page *pageio;
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, int gfp_flags)
cachefs_uncache_page(vnode->cache, page);
#endif
- pageio = (struct cachefs_page *) page->private;
- page->private = 0;
+ pageio = (struct cachefs_page *) page_private(page);
+ set_page_private(page, 0);
ClearPagePrivate(page);
if (pageio)
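
The afs hunks above reflect two tree-wide conversions in this merge: raw pokes at page->private become the page_private()/set_page_private() accessors, and bare int gfp parameters become the gfp_t typedef so allocation flags are type-checked. A minimal userspace sketch of the accessor idea; struct page and the helpers below are illustrative stand-ins, not the kernel's definitions:

    #include <stdio.h>

    struct page { unsigned long private; };

    /* Accessors funnel every touch of the field through one place, so its
     * type or encoding can change later without editing each call site. */
    static inline unsigned long page_private(const struct page *p)
    {
            return p->private;
    }

    static inline void set_page_private(struct page *p, unsigned long v)
    {
            p->private = v;
    }

    int main(void)
    {
            struct page pg;
            set_page_private(&pg, 42UL);
            printf("%lu\n", page_private(&pg));
            return 0;
    }
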
diff --git a/fs/aio.c b/fs/aio.c
index 9fe7216457d..edfca5b7553 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1397,6 +1397,9 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
kiocb->ki_left)))
break;
+ ret = security_file_permission(file, MAY_READ);
+ if (unlikely(ret))
+ break;
ret = -EINVAL;
if (file->f_op->aio_read)
kiocb->ki_retry = aio_pread;
@@ -1409,6 +1412,9 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
kiocb->ki_left)))
break;
+ ret = security_file_permission(file, MAY_WRITE);
+ if (unlikely(ret))
+ break;
ret = -EINVAL;
if (file->f_op->aio_write)
kiocb->ki_retry = aio_pwrite;
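
The aio.c hunks insert an LSM permission check while the request is being set up, so a read or write that the security module would refuse fails at submission instead of being queued. A hedged sketch of that guard-clause shape; security_check() and setup_request() are placeholder names, not the real hooks:

    #include <errno.h>

    /* Placeholder hook: 0 grants access, negative errno denies it. */
    static int security_check(int fd, int mask)
    {
            return (fd < 0 || mask == 0) ? -EPERM : 0;
    }

    int setup_request(int fd, int mask)
    {
            int ret;

            ret = security_check(fd, mask);  /* deny before any work queues */
            if (ret)
                    return ret;
            /* pick the handler and queue the request only after the check */
            return 0;
    }
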
diff --git a/fs/attr.c b/fs/attr.c
index b1796fb9e52..67bcd9b14ea 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -117,9 +117,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
- if (!inode)
- BUG();
-
mode = inode->i_mode;
now = current_fs_time(inode->i_sb);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index dd9baabaf01..72011826f0c 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
- set_mm_counter(current->mm, rss, 0);
current->mm->mmap = NULL;
compute_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4b15576e58..6fa6adc4097 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
- set_mm_counter(current->mm, rss, 0);
current->mm->free_area_cache = current->mm->mmap_base;
current->mm->cached_hole_size = 0;
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
@@ -1503,9 +1502,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
fill_psinfo(psinfo, current->group_leader, current->mm);
fill_note(notes +1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
- fill_note(notes +2, "CORE", NT_TASKSTRUCT, sizeof(*current), current);
-
- numnote = 3;
+ numnote = 2;
auxv = (elf_addr_t *) current->mm->saved_auxv;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 134c9c0d1f5..dda87c4c82a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
&interp_params,
&current->mm->start_stack,
&current->mm->start_brk);
-#endif
-
- /* do this so that we can load the interpreter, if need be
- * - we will change some of these later
- */
- set_mm_counter(current->mm, rss, 0);
-#ifdef CONFIG_MMU
retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
if (retval < 0) {
send_sig(SIGKILL, current, 0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7974efa107b..9d6625829b9 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
- set_mm_counter(current->mm, rss, 0);
}
if (flags & FLAT_FLAG_KTRACE)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 227a2682d2b..00a91dc25d1 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
create_som_tables(bprm);
current->mm->start_stack = bprm->p;
- set_mm_counter(current->mm, rss, 0);
#if 0
printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff --git a/fs/bio.c b/fs/bio.c
index 7d81a93afd4..460554b07ff 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -778,7 +778,7 @@ static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
static struct bio *__bio_map_kern(request_queue_t *q, void *data,
- unsigned int len, unsigned int gfp_mask)
+ unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -825,7 +825,7 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data,
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
- unsigned int gfp_mask)
+ gfp_t gfp_mask)
{
struct bio *bio;
diff --git a/fs/buffer.c b/fs/buffer.c
index 1216c0d3c8c..35fa34977e8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,7 @@ static void
__clear_page_buffers(struct page *page)
{
ClearPagePrivate(page);
- page->private = 0;
+ set_page_private(page, 0);
page_cache_release(page);
}
@@ -502,7 +502,7 @@ static void free_more_memory(void)
yield();
for_each_pgdat(pgdat) {
- zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
+ zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
if (*zones)
try_to_free_pages(zones, GFP_NOFS);
}
@@ -1478,8 +1478,10 @@ EXPORT_SYMBOL(__getblk);
void __breadahead(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
- ll_rw_block(READA, 1, &bh);
- brelse(bh);
+ if (likely(bh)) {
+ ll_rw_block(READA, 1, &bh);
+ brelse(bh);
+ }
}
EXPORT_SYMBOL(__breadahead);
@@ -1497,7 +1499,7 @@ __bread(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
- if (!buffer_uptodate(bh))
+ if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
@@ -1571,7 +1573,7 @@ static inline void discard_buffer(struct buffer_head * bh)
*
* NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
-int try_to_release_page(struct page *page, int gfp_mask)
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
struct address_space * const mapping = page->mapping;
@@ -1637,6 +1639,15 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+int do_invalidatepage(struct page *page, unsigned long offset)
+{
+ int (*invalidatepage)(struct page *, unsigned long);
+ invalidatepage = page->mapping->a_ops->invalidatepage;
+ if (invalidatepage == NULL)
+ invalidatepage = block_invalidatepage;
+ return (*invalidatepage)(page, offset);
+}
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2696,7 +2707,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0);
+ do_invalidatepage(page, 0);
unlock_page(page);
return 0; /* don't care */
}
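
The new do_invalidatepage() above is a dispatch-with-fallback helper: block_write_full_page() no longer hardwires block_invalidatepage(), so a filesystem's own ->invalidatepage hook (ext3's journalled variant, for instance) is honoured. A small sketch of the same shape, with toy names:

    #include <stdio.h>

    struct ops {
            int (*invalidate)(int offset);   /* optional per-object hook */
    };

    static int default_invalidate(int offset)
    {
            printf("default invalidate at %d\n", offset);
            return 0;
    }

    /* Use the object's hook when it provides one, else the generic default. */
    int do_invalidate(const struct ops *ops, int offset)
    {
            int (*fn)(int) = ops->invalidate ? ops->invalidate
                                             : default_invalidate;
            return fn(offset);
    }

    int main(void)
    {
            struct ops plain = { .invalidate = NULL };
            return do_invalidate(&plain, 0);
    }
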
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 3d1cce3653b..6a3df88accf 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -370,8 +370,8 @@ static int init_coda_psdev(void)
}
devfs_mk_dir ("coda");
for (i = 0; i < MAX_CODADEVS; i++) {
- class_device_create(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR,i),
- NULL, "cfs%d", i);
+ class_device_create(coda_psdev_class, NULL,
+ MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
err = devfs_mk_cdev(MKDEV(CODA_PSDEV_MAJOR, i),
S_IFCHR|S_IRUSR|S_IWUSR, "coda/%d", i);
if (err)
diff --git a/fs/compat.c b/fs/compat.c
index a719e158e00..8e71cdbecc7 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
/* execve success */
security_bprm_free(bprm);
acct_update_integrals(current);
- update_mem_hiwater(current);
kfree(bprm);
return retval;
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e28a74203f3..43dbcb0b21e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -3046,10 +3046,15 @@ HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
/* Serial */
HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
+#ifdef TIOCGLTC
+COMPATIBLE_IOCTL(TIOCGLTC)
+COMPATIBLE_IOCTL(TIOCSLTC)
+#endif
/* Usbdevfs */
HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control)
HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk)
HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
+COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
/* i2c */
HANDLE_IOCTL(I2C_FUNCS, w_long)
HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
diff --git a/fs/dcache.c b/fs/dcache.c
index fb10386c59b..e90512ed35a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -689,7 +689,7 @@ void shrink_dcache_anon(struct hlist_head *head)
*
* In this case we return -1 to tell the caller that we baled.
*/
-static int shrink_dcache_memory(int nr, unsigned int gfp_mask)
+static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
if (nr) {
if (!(gfp_mask & __GFP_FS))
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc99..3931e7f1e6b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
up_read(&current->mm->mmap_sem);
if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+ struct page *page = ZERO_PAGE(dio->curr_user_address);
/*
* A memory fault, but the filesystem has some outstanding
* mapped blocks. We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
*/
if (dio->page_errors == 0)
dio->page_errors = ret;
- dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+ page_cache_get(page);
+ dio->pages[0] = page;
dio->head = 0;
dio->tail = 1;
ret = 0;
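
The direct-io fix takes a reference on ZERO_PAGE with page_cache_get() before storing it in dio->pages[], so the generic teardown that puts every entry in that array stays balanced. A sketch of get-before-publish reference counting, with a toy counter standing in for the page refcount:

    #include <assert.h>

    struct obj { int refs; };

    static void get(struct obj *o) { o->refs++; }
    static void put(struct obj *o) { assert(o->refs > 0); o->refs--; }

    static struct obj zero_obj = { .refs = 1 };  /* long-lived shared object */

    void publish(struct obj **slot)
    {
            get(&zero_obj);     /* take a reference *before* exposing it   */
            *slot = &zero_obj;  /* now a generic put on *slot is balanced  */
    }

    int main(void)
    {
            struct obj *slot;
            publish(&slot);
            put(slot);          /* consumer's unconditional put */
            assert(zero_obj.refs == 1);
            return 0;
    }
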
diff --git a/fs/dquot.c b/fs/dquot.c
index b9732335bcd..ea7644227a6 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -500,7 +500,7 @@ static void prune_dqcache(int count)
* more memory
*/
-static int shrink_dqcache_memory(int nr, unsigned int gfp_mask)
+static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
{
if (nr) {
spin_lock(&dq_list_lock);
@@ -662,7 +662,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
restart:
file_list_lock();
list_for_each(p, &sb->s_files) {
- struct file *filp = list_entry(p, struct file, f_list);
+ struct file *filp = list_entry(p, struct file, f_u.fu_list);
struct inode *inode = filp->f_dentry->d_inode;
if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
struct dentry *dentry = dget(filp->f_dentry);
diff --git a/fs/exec.c b/fs/exec.c
index a04a575ad43..10d493fea7c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -126,8 +126,7 @@ asmlinkage long sys_uselib(const char __user * library)
struct nameidata nd;
int error;
- nd.intent.open.flags = FMODE_READ;
- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ);
if (error)
goto out;
@@ -139,7 +138,7 @@ asmlinkage long sys_uselib(const char __user * library)
if (error)
goto exit;
- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+ file = nameidata_to_filp(&nd, O_RDONLY);
error = PTR_ERR(file);
if (IS_ERR(file))
goto out;
@@ -167,6 +166,7 @@ asmlinkage long sys_uselib(const char __user * library)
out:
return error;
exit:
+ release_open_intent(&nd);
path_release(&nd);
goto out;
}
@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma,
pud_t * pud;
pmd_t * pmd;
pte_t * pte;
+ spinlock_t *ptl;
if (unlikely(anon_vma_prepare(vma)))
- goto out_sig;
+ goto out;
flush_dcache_page(page);
pgd = pgd_offset(mm, address);
-
- spin_lock(&mm->page_table_lock);
pud = pud_alloc(mm, pgd, address);
if (!pud)
goto out;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
goto out;
- pte = pte_alloc_map(mm, pmd, address);
+ pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
if (!pte)
goto out;
if (!pte_none(*pte)) {
- pte_unmap(pte);
+ pte_unmap_unlock(pte, ptl);
goto out;
}
- inc_mm_counter(mm, rss);
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
page_add_anon_rmap(page, vma, address);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
/* no need for flush_tlb */
return;
out:
- spin_unlock(&mm->page_table_lock);
-out_sig:
__free_page(page);
force_sig(SIGKILL, current);
}
@@ -490,8 +486,7 @@ struct file *open_exec(const char *name)
int err;
struct file *file;
- nd.intent.open.flags = FMODE_READ;
- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ);
file = ERR_PTR(err);
if (!err) {
@@ -504,7 +499,7 @@ struct file *open_exec(const char *name)
err = -EACCES;
file = ERR_PTR(err);
if (!err) {
- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+ file = nameidata_to_filp(&nd, O_RDONLY);
if (!IS_ERR(file)) {
err = deny_write_access(file);
if (err) {
@@ -516,6 +511,7 @@ out:
return file;
}
}
+ release_open_intent(&nd);
path_release(&nd);
}
goto out;
@@ -634,10 +630,9 @@ static inline int de_thread(struct task_struct *tsk)
/*
* Account for the thread group leader hanging around:
*/
- count = 2;
- if (thread_group_leader(current))
- count = 1;
- else {
+ count = 1;
+ if (!thread_group_leader(current)) {
+ count = 2;
/*
* The SIGALRM timer survives the exec, but needs to point
* at us as the new group leader now. We have a race with
@@ -646,8 +641,10 @@ static inline int de_thread(struct task_struct *tsk)
* before we can safely let the old group leader die.
*/
sig->real_timer.data = (unsigned long)current;
+ spin_unlock_irq(lock);
if (del_timer_sync(&sig->real_timer))
add_timer(&sig->real_timer);
+ spin_lock_irq(lock);
}
while (atomic_read(&sig->count) > count) {
sig->group_exit_task = current;
@@ -659,7 +656,6 @@ static inline int de_thread(struct task_struct *tsk)
}
sig->group_exit_task = NULL;
sig->notify_count = 0;
- sig->real_timer.data = (unsigned long)current;
spin_unlock_irq(lock);
/*
@@ -1207,7 +1203,6 @@ int do_execve(char * filename,
/* execve success */
security_bprm_free(bprm);
acct_update_integrals(current);
- update_mem_hiwater(current);
kfree(bprm);
return retval;
}
@@ -1422,19 +1417,16 @@ static void zap_threads (struct mm_struct *mm)
static void coredump_wait(struct mm_struct *mm)
{
DECLARE_COMPLETION(startup_done);
+ int core_waiters;
- mm->core_waiters++; /* let other threads block */
mm->core_startup_done = &startup_done;
- /* give other threads a chance to run: */
- yield();
-
zap_threads(mm);
- if (--mm->core_waiters) {
- up_write(&mm->mmap_sem);
+ core_waiters = mm->core_waiters;
+ up_write(&mm->mmap_sem);
+
+ if (core_waiters)
wait_for_completion(&startup_done);
- } else
- up_write(&mm->mmap_sem);
BUG_ON(mm->core_waiters);
}
@@ -1468,11 +1460,21 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
current->fsuid = 0; /* Dump root private */
}
mm->dumpable = 0;
- init_completion(&mm->core_done);
+
+ retval = -EAGAIN;
spin_lock_irq(&current->sighand->siglock);
- current->signal->flags = SIGNAL_GROUP_EXIT;
- current->signal->group_exit_code = exit_code;
+ if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
+ current->signal->flags = SIGNAL_GROUP_EXIT;
+ current->signal->group_exit_code = exit_code;
+ retval = 0;
+ }
spin_unlock_irq(&current->sighand->siglock);
+ if (retval) {
+ up_write(&mm->mmap_sem);
+ goto fail;
+ }
+
+ init_completion(&mm->core_done);
coredump_wait(mm);
/*
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fdba4d1d3c6..e7d3f0522d0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -440,6 +440,10 @@ static int ext2_alloc_branch(struct inode *inode,
* the pointer to new one, then send parent to disk.
*/
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ err = -EIO;
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
branch[n].bh = bh;
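
This hunk, like the matching ext3 ones further down, adds the previously missing NULL check after sb_getblk(): a failed buffer-head allocation now surfaces as -EIO instead of an oops in lock_buffer(). The shape, sketched with a hypothetical stand-in allocator:

    #include <errno.h>
    #include <stdlib.h>

    static void *getblk_stub(void)       /* may fail, like sb_getblk() */
    {
            return malloc(16);
    }

    int alloc_branch(int n)
    {
            int i, err = 0;

            for (i = 0; i < n; i++) {
                    void *bh = getblk_stub();
                    if (!bh) {
                            err = -EIO;  /* report, don't dereference NULL */
                            break;       /* caller unwinds what was built  */
                    }
                    /* ... zero the block, link it into branch[i] ... */
                    free(bh);            /* sketch only; real code keeps bh */
            }
            return err;
    }
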
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0213db4911a..7992d21e0e0 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,8 @@
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
+#include "bitmap.h"
+
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
@@ -1010,7 +1012,7 @@ retry:
* allocation within the reservation window.
*
* This will avoid keeping on searching the reservation list again and
- * again when someboday is looking for a free block (without
+ * again when somebody is looking for a free block (without
* reservation), and there are lots of free blocks, but they are all
* being reserved.
*
@@ -1416,12 +1418,12 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;
- lock_super(sb);
es = EXT3_SB(sb)->s_es;
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
+ smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext3_get_group_desc(sb, i, NULL);
if (!gdp)
@@ -1440,7 +1442,6 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
brelse(bitmap_bh);
printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
- unlock_super(sb);
return bitmap_count;
#else
desc_count = 0;
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index 6c419b9ab0e..5b4ba3e246e 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -8,7 +8,7 @@
*/
#include <linux/buffer_head.h>
-
+#include "bitmap.h"
static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
diff --git a/fs/ext3/bitmap.h b/fs/ext3/bitmap.h
new file mode 100644
index 00000000000..6ee503a6bb4
--- /dev/null
+++ b/fs/ext3/bitmap.h
@@ -0,0 +1,8 @@
+/* linux/fs/ext3/bitmap.c
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ * Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned int );
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 6549945f9ac..df3f517c54a 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -26,6 +26,7 @@
#include <asm/byteorder.h>
+#include "bitmap.h"
#include "xattr.h"
#include "acl.h"
@@ -704,7 +705,6 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
unsigned long bitmap_count, x;
struct buffer_head *bitmap_bh = NULL;
- lock_super (sb);
es = EXT3_SB(sb)->s_es;
desc_count = 0;
bitmap_count = 0;
@@ -727,7 +727,6 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
brelse(bitmap_bh);
printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
- unlock_super(sb);
return desc_count;
#else
desc_count = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b5177c90d6f..5d9b00e2883 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -491,7 +491,7 @@ static unsigned long ext3_find_goal(struct inode *inode, long block,
* the same format as ext3_get_branch() would do. We are calling it after
* we had read the existing part of chain and partial points to the last
* triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext3_get_block(), excpet that in one
+ * picture as after the successful ext3_get_block(), except that in one
* place chain is disconnected - *branch->p is still zero (we did not
* set the last link), but branch->key contains the number that should
* be placed into *branch->p to fill that gap.
@@ -523,7 +523,6 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
if (!nr)
break;
branch[n].key = cpu_to_le32(nr);
- keys = n+1;
/*
* Get buffer_head for parent block, zero it out
@@ -531,6 +530,9 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh)
+ break;
+ keys = n+1;
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -864,6 +866,10 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
if (!*errp && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+ if (!bh) {
+ *errp = -EIO;
+ goto err;
+ }
if (buffer_new(&dummy)) {
J_ASSERT(create != 0);
J_ASSERT(handle != 0);
@@ -896,6 +902,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
}
return bh;
}
+err:
return NULL;
}
@@ -1434,7 +1441,7 @@ static int ext3_invalidatepage(struct page *page, unsigned long offset)
return journal_invalidatepage(journal, page, offset);
}
-static int ext3_releasepage(struct page *page, int wait)
+static int ext3_releasepage(struct page *page, gfp_t wait)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 50378d8ff84..b3c690a3b54 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,6 +36,8 @@
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
+
+#include "namei.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
new file mode 100644
index 00000000000..f2ce2b0065c
--- /dev/null
+++ b/fs/ext3/namei.h
@@ -0,0 +1,8 @@
+/* linux/fs/ext3/namei.h
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ * Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern struct dentry *ext3_get_parent(struct dentry *child);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 57f79106267..1be78b4b4de 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -118,6 +118,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
+ if (!bh)
+ return ERR_PTR(-EIO);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -202,6 +204,10 @@ static int setup_new_group_blocks(struct super_block *sb,
ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
gdb = sb_getblk(sb, block);
+ if (!gdb) {
+ err = -EIO;
+ goto exit_bh;
+ }
if ((err = ext3_journal_get_write_access(handle, gdb))) {
brelse(gdb);
goto exit_bh;
@@ -643,6 +649,10 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
+ if (!bh) {
+ err = -EIO;
+ break;
+ }
ext3_debug("update metadata backup %#04lx\n",
(unsigned long)bh->b_blocknr);
if ((err = ext3_journal_get_write_access(handle, bh)))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e24ceb019f..f594989ccb7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,9 +36,12 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+
#include <asm/uaccess.h>
+
#include "xattr.h"
#include "acl.h"
+#include "namei.h"
static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -510,19 +513,11 @@ static void ext3_clear_inode(struct inode *inode)
kfree(rsv);
}
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
{
- struct super_block *sb = vfs->mnt_sb;
+#if defined(CONFIG_QUOTA)
struct ext3_sb_info *sbi = EXT3_SB(sb);
- if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
-#if defined(CONFIG_QUOTA)
if (sbi->s_jquota_fmt)
seq_printf(seq, ",jqfmt=%s",
(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
@@ -539,6 +534,20 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
seq_puts(seq, ",grpquota");
#endif
+}
+
+static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct super_block *sb = vfs->mnt_sb;
+
+ if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+ seq_puts(seq, ",data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+ seq_puts(seq, ",data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+ seq_puts(seq, ",data=writeback");
+
+ ext3_show_quota_options(seq, sb);
return 0;
}
@@ -609,7 +618,6 @@ static struct super_operations ext3_sops = {
#endif
};
-struct dentry *ext3_get_parent(struct dentry *child);
static struct export_operations ext3_export_ops = {
.get_parent = ext3_get_parent,
};
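
The super.c hunk factors the quota mount options out into ext3_show_quota_options(), an inline helper whose body sits under CONFIG_QUOTA, keeping ext3_show_options() short and letting the quota text compile away. A sketch of that helper-with-internal-#ifdef style; CONFIG_QUOTA is the only symbol assumed from the source:

    #include <stdio.h>

    static inline void show_quota_options(FILE *seq)
    {
            (void)seq;                    /* silence unused warning when off */
    #if defined(CONFIG_QUOTA)
            fputs(",usrquota", seq);      /* quota-only option text lives here */
    #endif
    }

    int show_options(FILE *seq)
    {
            fputs(",data=ordered", seq);  /* options that always exist */
            show_quota_options(seq);      /* empty inline when quota is off */
            return 0;
    }

    int main(void)
    {
            show_options(stdout);
            putchar('\n');
            return 0;
    }
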
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 269c7b92db9..430de9f63be 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -210,7 +210,7 @@ ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
return cmp ? -ENODATA : 0;
}
-int
+static int
ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
@@ -354,7 +354,7 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
return buffer_size - rest;
}
-int
+static int
ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
{
struct buffer_head *bh = NULL;
@@ -626,7 +626,7 @@ struct ext3_xattr_block_find {
struct buffer_head *bh;
};
-int
+static int
ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
struct ext3_xattr_block_find *bs)
{
@@ -859,7 +859,7 @@ struct ext3_xattr_ibody_find {
struct ext3_iloc iloc;
};
-int
+static int
ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
struct ext3_xattr_ibody_find *is)
{
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 895049b2ac9..ba824964b9b 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -222,6 +222,80 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
return len;
}
+enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
+
+/**
+ * fat_parse_long - Parse extended directory entry.
+ *
+ * This function returns zero on success, negative value on error, or one of
+ * the following:
+ *
+ * %PARSE_INVALID - Directory entry is invalid.
+ * %PARSE_NOT_LONGNAME - Directory entry does not contain longname.
+ * %PARSE_EOF - Directory has no more entries.
+ */
+static int fat_parse_long(struct inode *dir, loff_t *pos,
+ struct buffer_head **bh, struct msdos_dir_entry **de,
+ wchar_t **unicode, unsigned char *nr_slots)
+{
+ struct msdos_dir_slot *ds;
+ unsigned char id, slot, slots, alias_checksum;
+
+ if (!*unicode) {
+ *unicode = (wchar_t *)__get_free_page(GFP_KERNEL);
+ if (!*unicode) {
+ brelse(*bh);
+ return -ENOMEM;
+ }
+ }
+parse_long:
+ slots = 0;
+ ds = (struct msdos_dir_slot *)*de;
+ id = ds->id;
+ if (!(id & 0x40))
+ return PARSE_INVALID;
+ slots = id & ~0x40;
+ if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */
+ return PARSE_INVALID;
+ *nr_slots = slots;
+ alias_checksum = ds->alias_checksum;
+
+ slot = slots;
+ while (1) {
+ int offset;
+
+ slot--;
+ offset = slot * 13;
+ fat16_towchar(*unicode + offset, ds->name0_4, 5);
+ fat16_towchar(*unicode + offset + 5, ds->name5_10, 6);
+ fat16_towchar(*unicode + offset + 11, ds->name11_12, 2);
+
+ if (ds->id & 0x40)
+ (*unicode)[offset + 13] = 0;
+ if (fat_get_entry(dir, pos, bh, de) < 0)
+ return PARSE_EOF;
+ if (slot == 0)
+ break;
+ ds = (struct msdos_dir_slot *)*de;
+ if (ds->attr != ATTR_EXT)
+ return PARSE_NOT_LONGNAME;
+ if ((ds->id & ~0x40) != slot)
+ goto parse_long;
+ if (ds->alias_checksum != alias_checksum)
+ goto parse_long;
+ }
+ if ((*de)->name[0] == DELETED_FLAG)
+ return PARSE_INVALID;
+ if ((*de)->attr == ATTR_EXT)
+ goto parse_long;
+ if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME))
+ return PARSE_INVALID;
+ if (fat_checksum((*de)->name) != alias_checksum)
+ *nr_slots = 0;
+
+ return 0;
+}
+
/*
* Return values: negative -> error, 0 -> not found, positive -> found,
* value is the total amount of slots, including the shortname entry.
@@ -259,68 +333,16 @@ parse_record:
if (de->attr != ATTR_EXT && IS_FREE(de->name))
continue;
if (de->attr == ATTR_EXT) {
- struct msdos_dir_slot *ds;
- unsigned char id;
- unsigned char slot;
- unsigned char slots;
- unsigned char sum;
- unsigned char alias_checksum;
-
- if (!unicode) {
- unicode = (wchar_t *)
- __get_free_page(GFP_KERNEL);
- if (!unicode) {
- brelse(bh);
- return -ENOMEM;
- }
- }
-parse_long:
- slots = 0;
- ds = (struct msdos_dir_slot *) de;
- id = ds->id;
- if (!(id & 0x40))
- continue;
- slots = id & ~0x40;
- if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */
- continue;
- nr_slots = slots;
- alias_checksum = ds->alias_checksum;
-
- slot = slots;
- while (1) {
- int offset;
-
- slot--;
- offset = slot * 13;
- fat16_towchar(unicode + offset, ds->name0_4, 5);
- fat16_towchar(unicode + offset + 5, ds->name5_10, 6);
- fat16_towchar(unicode + offset + 11, ds->name11_12, 2);
-
- if (ds->id & 0x40) {
- unicode[offset + 13] = 0;
- }
- if (fat_get_entry(inode, &cpos, &bh, &de) < 0)
- goto EODir;
- if (slot == 0)
- break;
- ds = (struct msdos_dir_slot *) de;
- if (ds->attr != ATTR_EXT)
- goto parse_record;
- if ((ds->id & ~0x40) != slot)
- goto parse_long;
- if (ds->alias_checksum != alias_checksum)
- goto parse_long;
- }
- if (de->name[0] == DELETED_FLAG)
- continue;
- if (de->attr == ATTR_EXT)
- goto parse_long;
- if (IS_FREE(de->name) || (de->attr & ATTR_VOLUME))
+ int status = fat_parse_long(inode, &cpos, &bh, &de,
+ &unicode, &nr_slots);
+ if (status < 0)
+ return status;
+ else if (status == PARSE_INVALID)
continue;
- for (sum = 0, i = 0; i < 11; i++)
- sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + de->name[i];
- if (sum != alias_checksum)
- nr_slots = 0;
+ else if (status == PARSE_NOT_LONGNAME)
+ goto parse_record;
+ else if (status == PARSE_EOF)
+ goto EODir;
}
memcpy(work, de->name, sizeof(de->name));
@@ -408,8 +430,8 @@ struct fat_ioctl_filldir_callback {
int short_len;
};
-static int fat_readdirx(struct inode *inode, struct file *filp, void *dirent,
- filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
+ filldir_t filldir, int short_only, int both)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -458,9 +480,10 @@ static int fat_readdirx(struct inode *inode, struct file *filp, void *dirent,
bh = NULL;
GetNew:
- long_slots = 0;
if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
goto EODir;
+parse_record:
+ long_slots = 0;
/* Check for long filename entry */
if (isvfat) {
if (de->name[0] == DELETED_FLAG)
@@ -475,69 +498,18 @@ GetNew:
}
if (isvfat && de->attr == ATTR_EXT) {
- struct msdos_dir_slot *ds;
- unsigned char id;
- unsigned char slot;
- unsigned char slots;
- unsigned char sum;
- unsigned char alias_checksum;
-
- if (!unicode) {
- unicode = (wchar_t *)__get_free_page(GFP_KERNEL);
- if (!unicode) {
- filp->f_pos = cpos;
- brelse(bh);
- ret = -ENOMEM;
- goto out;
- }
- }
-ParseLong:
- slots = 0;
- ds = (struct msdos_dir_slot *) de;
- id = ds->id;
- if (!(id & 0x40))
- goto RecEnd;
- slots = id & ~0x40;
- if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */
+ int status = fat_parse_long(inode, &cpos, &bh, &de,
+ &unicode, &long_slots);
+ if (status < 0) {
+ filp->f_pos = cpos;
+ ret = status;
+ goto out;
+ } else if (status == PARSE_INVALID)
goto RecEnd;
- long_slots = slots;
- alias_checksum = ds->alias_checksum;
-
- slot = slots;
- while (1) {
- int offset;
-
- slot--;
- offset = slot * 13;
- fat16_towchar(unicode + offset, ds->name0_4, 5);
- fat16_towchar(unicode + offset + 5, ds->name5_10, 6);
- fat16_towchar(unicode + offset + 11, ds->name11_12, 2);
-
- if (ds->id & 0x40) {
- unicode[offset + 13] = 0;
- }
- if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
- goto EODir;
- if (slot == 0)
- break;
- ds = (struct msdos_dir_slot *) de;
- if (ds->attr != ATTR_EXT)
- goto RecEnd; /* XXX */
- if ((ds->id & ~0x40) != slot)
- goto ParseLong;
- if (ds->alias_checksum != alias_checksum)
- goto ParseLong;
- }
- if (de->name[0] == DELETED_FLAG)
- goto RecEnd;
- if (de->attr == ATTR_EXT)
- goto ParseLong;
- if (IS_FREE(de->name) || (de->attr & ATTR_VOLUME))
- goto RecEnd;
- for (sum = 0, i = 0; i < 11; i++)
- sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + de->name[i];
- if (sum != alias_checksum)
- long_slots = 0;
+ else if (status == PARSE_NOT_LONGNAME)
+ goto parse_record;
+ else if (status == PARSE_EOF)
+ goto EODir;
}
if (sbi->options.dotsOK) {
@@ -671,7 +643,7 @@ out:
static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct inode *inode = filp->f_dentry->d_inode;
- return fat_readdirx(inode, filp, dirent, filldir, 0, 0);
+ return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
}
static int fat_ioctl_filldir(void *__buf, const char *name, int name_len,
@@ -760,8 +732,8 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
down(&inode->i_sem);
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
- ret = fat_readdirx(inode, filp, &buf, fat_ioctl_filldir,
- short_only, both);
+ ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir,
+ short_only, both);
}
up(&inode->i_sem);
if (ret >= 0)
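
The fat/dir.c rework hoists the long-name parsing loop, previously duplicated in fat_search_long() and the readdir path, into fat_parse_long(), which reports back through a small status enum: negative errno for hard failures, PARSE_* values for control flow. A sketch of the helper-plus-status pattern with stand-in names:

    #include <errno.h>

    enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF };

    /* Stand-in parser: negative errno on hard error, 0 when a long name
     * was assembled, PARSE_* values for the caller's control flow. */
    int parse_entry(int raw)
    {
            if (raw < 0)
                    return -EIO;
            if (raw == 0)
                    return PARSE_EOF;
            return (raw & 1) ? 0 : PARSE_INVALID;
    }

    int scan_one(int raw)
    {
            int status = parse_entry(raw);

            if (status < 0)
                    return status;            /* propagate the error      */
            else if (status == PARSE_EOF)
                    return 0;                 /* end of directory         */
            else if (status == PARSE_INVALID)
                    return 1;                 /* skip entry, keep walking */
            /* status == 0: consume the parsed long name */
            return 1;
    }
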
diff --git a/fs/file_table.c b/fs/file_table.c
index 86ec8ae985b..4dc20554654 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -56,13 +56,13 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
static inline void file_free_rcu(struct rcu_head *head)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
+ struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
/* Find an unused file structure and return a pointer to it.
@@ -95,7 +95,7 @@ struct file *get_empty_filp(void)
f->f_gid = current->fsgid;
rwlock_init(&f->f_owner.lock);
/* f->f_version: 0 */
- INIT_LIST_HEAD(&f->f_list);
+ INIT_LIST_HEAD(&f->f_u.fu_list);
return f;
over:
@@ -225,15 +225,15 @@ void file_move(struct file *file, struct list_head *list)
if (!list)
return;
file_list_lock();
- list_move(&file->f_list, list);
+ list_move(&file->f_u.fu_list, list);
file_list_unlock();
}
void file_kill(struct file *file)
{
- if (!list_empty(&file->f_list)) {
+ if (!list_empty(&file->f_u.fu_list)) {
file_list_lock();
- list_del_init(&file->f_list);
+ list_del_init(&file->f_u.fu_list);
file_list_unlock();
}
}
@@ -245,7 +245,7 @@ int fs_may_remount_ro(struct super_block *sb)
/* Check that no files are currently opened for writing. */
file_list_lock();
list_for_each(p, &sb->s_files) {
- struct file *file = list_entry(p, struct file, f_list);
+ struct file *file = list_entry(p, struct file, f_u.fu_list);
struct inode *inode = file->f_dentry->d_inode;
/* File with pending delete? */
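
These file_table.c hunks (and the list_entry() callers in dquot.c above) track struct file's f_list moving into a union f_u: a file is either linked on a per-sb list (fu_list) or waiting for its RCU free callback (fu_rcuhead), never both, so the two members can share storage. A sketch of that either/or union, with toy types in place of list_head and rcu_head:

    struct list_node { struct list_node *next, *prev; };
    struct rcu_node  { struct rcu_node *next; void (*func)(struct rcu_node *); };

    struct file_like {
            /* A live file sits on a list; a dying one waits for its RCU
             * callback. The states never overlap, so the members can
             * share storage instead of each costing its own field. */
            union {
                    struct list_node fu_list;
                    struct rcu_node  fu_rcuhead;
            } f_u;
            int f_mode;
    };
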
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 44082bfdfec..9f1072836c8 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -12,6 +12,7 @@
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/sched.h> /* for 'current' */
#include <asm/uaccess.h>
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e94ab398b71..ffab4783ac6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -230,7 +230,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
* The inode is clean, unused
*/
list_move(&inode->i_list, &inode_unused);
- inodes_stat.nr_unused++;
}
}
wake_up_inode(inode);
@@ -238,14 +237,20 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * Write out an inode's dirty pages. Called under inode_lock.
+ * Write out an inode's dirty pages. Called under inode_lock. Either the
+ * caller has ref on the inode (either via __iget or via syscall against an fd)
+ * or the inode has I_WILL_FREE set (via generic_forget_inode)
*/
static int
-__writeback_single_inode(struct inode *inode,
- struct writeback_control *wbc)
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
wait_queue_head_t *wqh;
+ if (!atomic_read(&inode->i_count))
+ WARN_ON(!(inode->i_state & I_WILL_FREE));
+ else
+ WARN_ON(inode->i_state & I_WILL_FREE);
+
if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
list_move(&inode->i_list, &inode->i_sb->s_dirty);
return 0;
@@ -259,11 +264,9 @@ __writeback_single_inode(struct inode *inode,
wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
do {
- __iget(inode);
spin_unlock(&inode_lock);
__wait_on_bit(wqh, &wq, inode_wait,
TASK_UNINTERRUPTIBLE);
- iput(inode);
spin_lock(&inode_lock);
} while (inode->i_state & I_LOCK);
}
@@ -541,14 +544,15 @@ void sync_inodes(int wait)
}
/**
- * write_inode_now - write an inode to disk
- * @inode: inode to write to disk
- * @sync: whether the write should be synchronous or not
+ * write_inode_now - write an inode to disk
+ * @inode: inode to write to disk
+ * @sync: whether the write should be synchronous or not
+ *
+ * This function commits an inode to disk immediately if it is dirty. This is
+ * primarily needed by knfsd.
*
- * This function commits an inode to disk immediately if it is
- * dirty. This is primarily needed by knfsd.
+ * The caller must either have a ref on the inode or must have set I_WILL_FREE.
*/
-
int write_inode_now(struct inode *inode, int sync)
{
int ret;
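
The fs-writeback changes turn the documented caller contract — hold a reference, or have set I_WILL_FREE — into runtime WARN_ON checks at the top of __writeback_single_inode(). A sketch of encoding such a precondition, with assert() standing in for WARN_ON and simplified types:

    #include <assert.h>

    #define I_WILL_FREE 0x1

    struct inode_like { int i_count; int i_state; };

    /* Contract: the caller holds a reference, or the inode is marked
     * I_WILL_FREE — exactly one of the two, never neither or both. */
    void writeback_one(struct inode_like *in)
    {
            if (in->i_count == 0)
                    assert(in->i_state & I_WILL_FREE);
            else
                    assert(!(in->i_state & I_WILL_FREE));
            /* ... write out the dirty pages ... */
    }
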
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d4c869c6d01..a6f90a6c754 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -151,9 +151,9 @@ void fuse_release_background(struct fuse_req *req)
/*
* This function is called when a request is finished. Either a reply
* has arrived or it was interrupted (and not yet sent) or some error
- * occured during communication with userspace, or the device file was
- * closed. It decreases the referece count for the request. In case
- * of a background request the referece to the stored objects are
+ * occurred during communication with userspace, or the device file was
+ * closed. It decreases the reference count for the request. In case
+ * of a background request the reference to the stored objects are
* released. The requester thread is woken up (if still waiting), and
* finally the request is either freed or put on the unused_list
*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 29f1e9f6e85..70dba721aca 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -741,13 +741,14 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (inode && S_ISDIR(inode->i_mode)) {
/* Don't allow creating an alias to a directory */
struct dentry *alias = d_find_alias(inode);
- if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+ if (alias) {
dput(alias);
iput(inode);
return ERR_PTR(-EIO);
}
}
- return d_splice_alias(inode, entry);
+ d_add(entry, inode);
+ return NULL;
}
static int fuse_setxattr(struct dentry *entry, const char *name,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24d761518d8..5cb456f572c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -349,22 +349,22 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
int isdir);
/**
- * Initialise file operations on a regular file
+ * Initialize file operations on a regular file
*/
void fuse_init_file_inode(struct inode *inode);
/**
- * Initialise inode operations on regular files and special files
+ * Initialize inode operations on regular files and special files
*/
void fuse_init_common(struct inode *inode);
/**
- * Initialise inode and file operations on a directory
+ * Initialize inode and file operations on a directory
*/
void fuse_init_dir(struct inode *inode);
/**
- * Initialise inode operations on a symlink
+ * Initialize inode operations on a symlink
*/
void fuse_init_symlink(struct inode *inode);
@@ -411,7 +411,7 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc);
/**
* Decrement reference count of a request. If count goes to zero put
- * on unused list (preallocated) or free reqest (not preallocated).
+ * on unused list (preallocated) or free request (not preallocated).
*/
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
@@ -431,7 +431,7 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
/**
- * Release inodes and file assiciated with background request
+ * Release inodes and file associated with background request
*/
void fuse_release_background(struct fuse_req *req);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f1570b9f9de..3f680c5675b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -46,7 +46,7 @@ static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfs_get_block);
}
-static int hfs_releasepage(struct page *page, int mask)
+static int hfs_releasepage(struct page *page, gfp_t mask)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d5642705f63..f205773ddfb 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -40,7 +40,7 @@ static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfsplus_get_block);
}
-static int hfsplus_releasepage(struct page *page, int mask)
+static int hfsplus_releasepage(struct page *page, gfp_t mask)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index fd0f0f050e1..452fc1fdbd3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -50,6 +50,7 @@ static void hfsplus_read_inode(struct inode *inode)
init_MUTEX(&HFSPLUS_I(inode).extents_lock);
HFSPLUS_I(inode).flags = 0;
HFSPLUS_I(inode).rsrc_inode = NULL;
+ atomic_set(&HFSPLUS_I(inode).opencnt, 0);
if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
read_inode:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a9b6d179cb..e026c807e6b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
int sysctl_hugetlb_shm_group;
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); ++i)
+ put_page(pvec->pages[i]);
+
+ pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA. This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ int i;
+ struct pagevec pvec;
+ unsigned long start = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+ pgoff_t next = vma->vm_pgoff;
+ pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+
+ pagevec_init(&pvec, 0);
+ while (next < endpg) {
+ if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ break;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > next)
+ next = page->index;
+ if (page->index >= endpg)
+ break;
+ next++;
+ hugepages--;
+ }
+ huge_pagevec_release(&pvec);
+ }
+ return hugepages << HPAGE_SHIFT;
+}
+
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
+ unsigned long bytes;
loff_t len, vma_len;
int ret;
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
return -EINVAL;
+ bytes = huge_pages_needed(mapping, vma);
+ if (!is_hugepage_mem_enough(bytes))
+ return -ENOMEM;
+
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
down(&inode->i_sem);
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
+ hugetlb_prefault_arch_hook(vma->vm_mm);
if (inode->i_size < len)
inode->i_size = len;
out:
@@ -92,7 +142,7 @@ out:
}
/*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
*/
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
return -EINVAL;
}
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
static void truncate_huge_page(struct page *page)
{
clear_page_dirty(page);
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
static void hugetlbfs_delete_inode(struct inode *inode)
{
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
-
- hlist_del_init(&inode->i_hash);
- list_del_init(&inode->i_list);
- list_del_init(&inode->i_sb_list);
- inode->i_state |= I_FREEING;
- inodes_stat.nr_inodes--;
- spin_unlock(&inode_lock);
-
if (inode->i_data.nrpages)
truncate_hugepages(&inode->i_data, 0);
-
- security_inode_delete(inode);
-
- if (sbinfo->free_inodes >= 0) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
-
clear_inode(inode);
- destroy_inode(inode);
}
static void hugetlbfs_forget_inode(struct inode *inode)
{
- struct super_block *super_block = inode->i_sb;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
+ struct super_block *sb = inode->i_sb;
- if (hlist_unhashed(&inode->i_hash))
- goto out_truncate;
-
- if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
- list_del(&inode->i_list);
- list_add(&inode->i_list, &inode_unused);
- }
- inodes_stat.nr_unused++;
- if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK)))
+ list_move(&inode->i_list, &inode_unused);
+ inodes_stat.nr_unused++;
+ if (!sb || (sb->s_flags & MS_ACTIVE)) {
+ spin_unlock(&inode_lock);
+ return;
+ }
+ inode->i_state |= I_WILL_FREE;
spin_unlock(&inode_lock);
- return;
+ /*
+ * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
+ * in our backing_dev_info.
+ */
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inode->i_state &= ~I_WILL_FREE;
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
}
-
- /* write_inode_now() ? */
- inodes_stat.nr_unused--;
- hlist_del_init(&inode->i_hash);
-out_truncate:
list_del_init(&inode->i_list);
list_del_init(&inode->i_sb_list);
inode->i_state |= I_FREEING;
@@ -277,13 +300,6 @@ out_truncate:
spin_unlock(&inode_lock);
if (inode->i_data.nrpages)
truncate_hugepages(&inode->i_data, 0);
-
- if (sbinfo->free_inodes >= 0) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
-
clear_inode(inode);
destroy_inode(inode);
}
@@ -291,7 +307,7 @@ out_truncate:
static void hugetlbfs_drop_inode(struct inode *inode)
{
if (!inode->i_nlink)
- hugetlbfs_delete_inode(inode);
+ generic_delete_inode(inode);
else
hugetlbfs_forget_inode(inode);
}
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
unsigned long h_vm_pgoff;
- unsigned long v_length;
unsigned long v_offset;
h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
if (h_vm_pgoff >= h_pgoff)
v_offset = 0;
- v_length = vma->vm_end - vma->vm_start;
-
- zap_hugepage_range(vma,
- vma->vm_start + v_offset,
- v_length - v_offset);
+ unmap_hugepage_range(vma,
+ vma->vm_start + v_offset, vma->vm_end);
}
}
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
gid_t gid, int mode, dev_t dev)
{
struct inode *inode;
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
-
- if (sbinfo->free_inodes >= 0) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return NULL;
- }
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
- }
inode = new_inode(sb);
if (inode) {
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
}
}
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+ if (sbinfo->free_inodes >= 0) {
+ spin_lock(&sbinfo->stat_lock);
+ if (unlikely(!sbinfo->free_inodes)) {
+ spin_unlock(&sbinfo->stat_lock);
+ return 0;
+ }
+ sbinfo->free_inodes--;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+
+ return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+ if (sbinfo->free_inodes >= 0) {
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_inodes++;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+}
+
+
static kmem_cache_t *hugetlbfs_inode_cachep;
static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
+ struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
struct hugetlbfs_inode_info *p;
+ if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
+ return NULL;
p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
- if (!p)
+ if (unlikely(!p)) {
+ hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
+ }
return &p->vfs_inode;
}
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
- struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR)
- inode_init_once(&ei->vfs_inode);
-}
-
static void hugetlbfs_destroy_inode(struct inode *inode)
{
+ hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = {
.set_page_dirty = hugetlbfs_set_page_dirty,
};
+
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(&ei->vfs_inode);
+}
+
struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = simple_sync_file,
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = {
.alloc_inode = hugetlbfs_alloc_inode,
.destroy_inode = hugetlbfs_destroy_inode,
.statfs = hugetlbfs_statfs,
+ .delete_inode = hugetlbfs_delete_inode,
.drop_inode = hugetlbfs_drop_inode,
.put_super = hugetlbfs_put_super,
};
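
The hugetlbfs rewrite concentrates the free-inode bookkeeping in the paired helpers above: a negative free_inodes acts as the "no limit" sentinel (safe to test unlocked, since it is fixed at mount time), and the decrement reports failure so hugetlbfs_alloc_inode() can back out. A sketch with a pthread mutex standing in for the stat_lock spinlock:

    #include <pthread.h>

    struct sb_info {
            long free_inodes;             /* < 0 means "no limit" */
            pthread_mutex_t stat_lock;
    };

    /* Returns 1 if an inode was reserved, 0 if the quota is exhausted. */
    int dec_free_inodes(struct sb_info *s)
    {
            int ok = 1;

            if (s->free_inodes >= 0) {    /* limited mount? set at mount time */
                    pthread_mutex_lock(&s->stat_lock);
                    if (s->free_inodes == 0)
                            ok = 0;
                    else
                            s->free_inodes--;
                    pthread_mutex_unlock(&s->stat_lock);
            }
            return ok;
    }

    void inc_free_inodes(struct sb_info *s)
    {
            if (s->free_inodes >= 0) {
                    pthread_mutex_lock(&s->stat_lock);
                    s->free_inodes++;
                    pthread_mutex_unlock(&s->stat_lock);
            }
    }
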
diff --git a/fs/inode.c b/fs/inode.c
index f80a79ff156..d8d04bd72b5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -475,7 +475,7 @@ static void prune_icache(int nr_to_scan)
* This function is passed the number of inodes to scan, and it returns the
* total number of remaining possibly-reclaimable inodes.
*/
-static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
if (nr) {
/*
@@ -1088,6 +1088,7 @@ static void generic_forget_inode(struct inode *inode)
if (inode->i_data.nrpages)
truncate_inode_pages(&inode->i_data, 0);
clear_inode(inode);
+ wake_up_inode(inode);
destroy_inode(inode);
}
diff --git a/fs/inotify.c b/fs/inotify.c
index a37e9fb1da5..9fbaebfdf40 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -176,6 +176,7 @@ static inline void put_inotify_dev(struct inotify_device *dev)
if (atomic_dec_and_test(&dev->count)) {
atomic_dec(&dev->user->inotify_devs);
free_uid(dev->user);
+ idr_destroy(&dev->idr);
kfree(dev);
}
}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7ae2c4fe506..e4b516ac498 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1606,7 +1606,7 @@ int journal_blocks_per_page(struct inode *inode)
* Simple support for retrying memory allocations. Introduced to help to
* debug different VM deadlock avoidance strategies.
*/
-void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry)
+void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
{
return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
}
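
__jbd_kmalloc() maps its boolean retry argument onto __GFP_NOFAIL, which asks the allocator to keep trying internally instead of ever returning NULL. A userspace analogue of that contract, looping in the wrapper rather than inside the allocator (sketch only; sched_yield() stands in for the allocator's reclaim wait):

    #include <stdlib.h>
    #include <sched.h>

    /* Model of kmalloc(size, flags | __GFP_NOFAIL): when retry is set,
     * the caller never sees failure. */
    static void *alloc_retry(size_t size, int retry)
    {
        void *p;

        while ((p = malloc(size)) == NULL) {
            if (!retry)
                return NULL;
            sched_yield();  /* give the system a chance to free memory */
        }
        return p;
    }
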
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 49bbc2be3d7..13cb05bf604 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1621,7 +1621,7 @@ out:
* while the data is part of a transaction. Yes?
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, int unused_gfp_mask)
+ struct page *page, gfp_t unused_gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0f224384f17..8210ac16a36 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
#include <linux/jffs2.h>
#include <linux/mtd/mtd.h>
#include <linux/completion.h>
+#include <linux/sched.h>
#include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 996d922e503..316133c626b 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -18,6 +18,8 @@
#include <linux/mtd/mtd.h>
#include <linux/crc32.h>
#include <linux/mtd/nand.h>
+#include <linux/jiffies.h>
+
#include "nodelist.h"
/* For testing write failures */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index eadf319bee2..68000a50ceb 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -74,7 +74,7 @@
static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
-static void dbBackSplit(dmtree_t * tp, int leafno);
+static int dbBackSplit(dmtree_t * tp, int leafno);
static int dbJoin(dmtree_t * tp, int leafno, int newval);
static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
@@ -305,7 +305,6 @@ int dbSync(struct inode *ipbmap)
filemap_fdatawrite(ipbmap->i_mapping);
filemap_fdatawait(ipbmap->i_mapping);
- ipbmap->i_state |= I_DIRTY;
diWriteSpecial(ipbmap, 0);
return (0);
@@ -2467,7 +2466,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
* that it is at the front of a binary buddy system.
*/
if (oldval == NOFREE) {
- dbBackSplit((dmtree_t *) dcp, leafno);
+ rc = dbBackSplit((dmtree_t *) dcp, leafno);
+ if (rc)
+ return rc;
oldval = dcp->stree[ti];
}
dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
@@ -2627,7 +2628,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static void dbBackSplit(dmtree_t * tp, int leafno)
+static int dbBackSplit(dmtree_t * tp, int leafno)
{
int budsz, bud, w, bsz, size;
int cursz;
@@ -2662,7 +2663,10 @@ static void dbBackSplit(dmtree_t * tp, int leafno)
*/
for (w = leafno, bsz = budsz;; bsz <<= 1,
w = (w < bud) ? w : bud) {
- assert(bsz < le32_to_cpu(tp->dmt_nleafs));
+ if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
+ jfs_err("JFS: block map error in dbBackSplit");
+ return -EIO;
+ }
/* determine the buddy.
*/
@@ -2681,7 +2685,11 @@ static void dbBackSplit(dmtree_t * tp, int leafno)
}
}
- assert(leaf[leafno] == size);
+ if (leaf[leafno] != size) {
+ jfs_err("JFS: wrong leaf value in dbBackSplit");
+ return -EIO;
+ }
+ return 0;
}
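
The dbBackSplit() change is a classic assert-to-error conversion: a corrupt on-disk block map no longer halts the kernel via assert(); the broken invariant is logged and returned as -EIO, and the new check in dbAdjCtl() propagates it upward. The pattern in miniature (userspace sketch, illustrative names):

    #include <errno.h>
    #include <stdio.h>

    /* Before: assert(leaf == size) would crash on corrupt metadata.
     * After: report the inconsistency and let callers fail the I/O. */
    static int check_leaf(int leaf, int size)
    {
        if (leaf != size) {
            fprintf(stderr, "fs: wrong leaf value (%d != %d)\n",
                    leaf, size);
            return -EIO;
        }
        return 0;
    }

    static int adjust_ctl(int leaf, int size)
    {
        int rc = check_leaf(leaf, size);

        if (rc)     /* propagate instead of asserting */
            return rc;
        /* ... continue with the split ... */
        return 0;
    }
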
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 4021d46da7e..28201b194f5 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -57,6 +57,12 @@
#include "jfs_debug.h"
/*
+ * __mark_inode_dirty expects inodes to be hashed. Since we don't want
+ * special inodes in the fileset inode space, we hash them to a dummy head
+ */
+static HLIST_HEAD(aggregate_hash);
+
+/*
* imap locks
*/
/* iag free list lock */
@@ -491,6 +497,8 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* release the page */
release_metapage(mp);
+ hlist_add_head(&ip->i_hash, &aggregate_hash);
+
return (ip);
}
@@ -514,8 +522,6 @@ void diWriteSpecial(struct inode *ip, int secondary)
ino_t inum = ip->i_ino;
struct metapage *mp;
- ip->i_state &= ~I_DIRTY;
-
if (secondary)
address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
else
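
diReadSpecial() now hashes JFS's special aggregate inodes onto a private dummy head because __mark_inode_dirty() ignores unhashed inodes; diWriteSpecial() and the jfs_txnmgr.c hunks below can therefore stop forcing I_DIRTY by hand, and jfs_fill_super() gains a matching insert_inode_hash() call. The trick is simply to give the inodes membership in a list that nothing ever searches. A self-contained model of the hlist shape used (re-implemented here for illustration, not the kernel headers):

    #include <stddef.h>

    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    static struct hlist_head aggregate_hash;   /* dummy head, never searched */

    static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
    {
        n->next = h->first;
        if (h->first)
            h->first->pprev = &n->next;
        h->first = n;
        n->pprev = &h->first;
    }

    struct inode { struct hlist_node i_hash; /* ... */ };

    /* Being on *some* hash list is all the dirty-tracking code checks;
     * which list it is does not matter for these special inodes. */
    static void hash_special_inode(struct inode *ip)
    {
        hlist_add_head(&ip->i_hash, &aggregate_hash);
    }
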
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 13d7e3f1feb..8a53981f9f2 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
atomic_t io_count;
struct metapage *mp[MPS_PER_PAGE];
};
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
static inline struct metapage *page_to_mp(struct page *page, uint offset)
{
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
if (!a)
return -ENOMEM;
memset(a, 0, sizeof(struct meta_anchor));
- page->private = (unsigned long)a;
+ set_page_private(page, (unsigned long)a);
SetPagePrivate(page);
kmap(page);
}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
a->mp[index] = NULL;
if (--a->mp_count == 0) {
kfree(a);
- page->private = 0;
+ set_page_private(page, 0);
ClearPagePrivate(page);
kunmap(page);
}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
#else
static inline struct metapage *page_to_mp(struct page *page, uint offset)
{
- return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+ return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
}
static inline int insert_metapage(struct page *page, struct metapage *mp)
{
if (mp) {
- page->private = (unsigned long)mp;
+ set_page_private(page, (unsigned long)mp);
SetPagePrivate(page);
kmap(page);
}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
static inline void remove_metapage(struct page *page, struct metapage *mp)
{
- page->private = 0;
+ set_page_private(page, 0);
ClearPagePrivate(page);
kunmap(page);
}
@@ -198,7 +198,7 @@ static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
}
}
-static inline struct metapage *alloc_metapage(unsigned int gfp_mask)
+static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
{
return mempool_alloc(metapage_mempool, gfp_mask);
}
@@ -395,6 +395,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) {
redirty = 1;
+ /*
+ * Make sure this page isn't blocked indefinitely.
+ * If the journal isn't undergoing I/O, push it
+ */
+ if (mp->log && !(mp->log->cflag & logGC_PAGEOUT))
+ jfs_flush_journal(mp->log, 0);
continue;
}
@@ -534,7 +540,7 @@ add_failed:
return -EIO;
}
-static int metapage_releasepage(struct page *page, int gfp_mask)
+static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
{
struct metapage *mp;
int busy = 0;
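
The jfs_metapage.c hunks are part of a tree-wide switch from touching page->private directly to the page_private()/set_page_private() accessors, so the field's representation can change, or debugging hooks be added, without editing every filesystem; the same file also picks up a gfp_t conversion and a journal push so nohomeok pages cannot be redirtied forever. The accessor idiom, in miniature (illustrative, not the mm headers):

    struct page {
        unsigned long private;  /* owner-defined; do not poke directly */
    };

    /* Accessors decouple users from the field's representation. */
    #define page_private(page)          ((page)->private)
    #define set_page_private(page, v)   ((page)->private = (v))

    struct metapage;

    static inline struct metapage *page_to_mp(struct page *page)
    {
        return (struct metapage *)page_private(page);
    }
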
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 9b71ed2674f..b660c93c92d 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2396,7 +2396,6 @@ static void txUpdateMap(struct tblock * tblk)
*/
if (tblk->xflag & COMMIT_CREATE) {
diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
- ipimap->i_state |= I_DIRTY;
/* update persistent block allocation map
* for the allocation of inode extent;
*/
@@ -2407,7 +2406,6 @@ static void txUpdateMap(struct tblock * tblk)
} else if (tblk->xflag & COMMIT_DELETE) {
ip = tblk->u.ip;
diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
- ipimap->i_state |= I_DIRTY;
iput(ip);
}
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a7fe2f2b969..e72f4ebb6e9 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3516,16 +3516,10 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
/* process entries backward from last index */
index = le16_to_cpu(p->header.nextindex) - 1;
- if (p->header.flag & BT_INTERNAL)
- goto getChild;
-
- /*
- * leaf page
- */
- /* Since this is the rightmost leaf, and we may have already freed
- * a page that was formerly to the right, let's make sure that the
- * next pointer is zero.
+ /* Since this is the rightmost page at this level, and we may have
+ * already freed a page that was formerly to the right, let's make
+ * sure that the next pointer is zero.
*/
if (p->header.next) {
if (log)
@@ -3539,6 +3533,12 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
p->header.next = 0;
}
+ if (p->header.flag & BT_INTERNAL)
+ goto getChild;
+
+ /*
+ * leaf page
+ */
freed = 0;
/* does region covered by leaf page precede Teof ? */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 71bc34b96b2..4226af3ea91 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -442,6 +442,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_nlink = 1;
inode->i_size = sb->s_bdev->bd_inode->i_size;
inode->i_mapping->a_ops = &jfs_metapage_aops;
+ insert_inode_hash(inode);
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->direct_inode = inode;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 82c77df81c5..c4c8601096e 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -173,11 +173,10 @@ nlm_bind_host(struct nlm_host *host)
/* If we've already created an RPC client, check whether
* RPC rebind is required
- * Note: why keep rebinding if we're on a tcp connection?
*/
if ((clnt = host->h_rpcclnt) != NULL) {
xprt = clnt->cl_xprt;
- if (!xprt->stream && time_after_eq(jiffies, host->h_nextrebind)) {
+ if (time_after_eq(jiffies, host->h_nextrebind)) {
clnt->cl_port = 0;
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
dprintk("lockd: next rebind in %ld jiffies\n",
@@ -189,7 +188,6 @@ nlm_bind_host(struct nlm_host *host)
goto forgetit;
xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout);
- xprt->nocong = 1; /* No congestion control for NLM */
xprt->resvport = 1; /* NLM requires a reserved port */
/* Existing NLM servers accept AUTH_UNIX only */
diff --git a/fs/locks.c b/fs/locks.c
index f7daa5f4894..a1e8b224801 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -316,21 +316,22 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
/* POSIX-1996 leaves the case l->l_len < 0 undefined;
POSIX-2001 defines it. */
start += l->l_start;
- end = start + l->l_len - 1;
- if (l->l_len < 0) {
+ if (start < 0)
+ return -EINVAL;
+ fl->fl_end = OFFSET_MAX;
+ if (l->l_len > 0) {
+ end = start + l->l_len - 1;
+ fl->fl_end = end;
+ } else if (l->l_len < 0) {
end = start - 1;
+ fl->fl_end = end;
start += l->l_len;
+ if (start < 0)
+ return -EINVAL;
}
-
- if (start < 0)
- return -EINVAL;
- if (l->l_len > 0 && end < 0)
- return -EOVERFLOW;
-
fl->fl_start = start; /* we record the absolute position */
- fl->fl_end = end;
- if (l->l_len == 0)
- fl->fl_end = OFFSET_MAX;
+ if (fl->fl_end < fl->fl_start)
+ return -EOVERFLOW;
fl->fl_owner = current->files;
fl->fl_pid = current->tgid;
@@ -362,14 +363,21 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
return -EINVAL;
}
- if (((start += l->l_start) < 0) || (l->l_len < 0))
+ start += l->l_start;
+ if (start < 0)
return -EINVAL;
- fl->fl_end = start + l->l_len - 1;
- if (l->l_len > 0 && fl->fl_end < 0)
- return -EOVERFLOW;
+ fl->fl_end = OFFSET_MAX;
+ if (l->l_len > 0) {
+ fl->fl_end = start + l->l_len - 1;
+ } else if (l->l_len < 0) {
+ fl->fl_end = start - 1;
+ start += l->l_len;
+ if (start < 0)
+ return -EINVAL;
+ }
fl->fl_start = start; /* we record the absolute position */
- if (l->l_len == 0)
- fl->fl_end = OFFSET_MAX;
+ if (fl->fl_end < fl->fl_start)
+ return -EOVERFLOW;
fl->fl_owner = current->files;
fl->fl_pid = current->tgid;
@@ -829,12 +837,16 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request)
/* Detect adjacent or overlapping regions (if same lock type)
*/
if (request->fl_type == fl->fl_type) {
+ /* In all comparisons of start vs end, use
+ * "start - 1" rather than "end + 1". If end
+ * is OFFSET_MAX, end + 1 will become negative.
+ */
if (fl->fl_end < request->fl_start - 1)
goto next_lock;
/* If the next lock in the list has entirely bigger
* addresses than the new one, insert the lock here.
*/
- if (fl->fl_start > request->fl_end + 1)
+ if (fl->fl_start - 1 > request->fl_end)
break;
/* If we come here, the new and old lock are of the
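
The rewritten flock_to_posix_lock()/flock64_to_posix_lock() spell out the three POSIX l_len cases: l_len > 0 locks [start, start + l_len - 1]; l_len == 0 locks from start to end of file (fl_end = OFFSET_MAX); and the POSIX-2001 l_len < 0 case locks the |l_len| bytes immediately before start. Computing fl_end first and then testing fl_end < fl_start is what avoids the old end-past-OFFSET_MAX overflow, and the same concern explains the __posix_lock_file() hunk's "start - 1 rather than end + 1" comparisons. A standalone version of the mapping with worked values (OFFSET_MAX spelled locally; sketch only, whence handling omitted):

    #include <errno.h>
    #include <stdio.h>

    #define OFFSET_MAX 0x7fffffffffffffffLL

    struct range { long long start, end; };

    static int posix_range(long long start, long long len, struct range *r)
    {
        if (start < 0)
            return -EINVAL;
        r->end = OFFSET_MAX;            /* l_len == 0: lock to EOF */
        if (len > 0) {
            r->end = start + len - 1;
        } else if (len < 0) {           /* lock the bytes before start */
            r->end = start - 1;
            start += len;
            if (start < 0)
                return -EINVAL;
        }
        r->start = start;
        if (r->end < r->start)
            return -EOVERFLOW;          /* start + len wrapped past max */
        return 0;
    }

    int main(void)
    {
        struct range r;

        posix_range(100, 10, &r);   /* locks [100, 109] */
        printf("[%lld,%lld]\n", r.start, r.end);
        posix_range(100, 0, &r);    /* locks [100, OFFSET_MAX] */
        printf("[%lld,%lld]\n", r.start, r.end);
        posix_range(100, -10, &r);  /* locks [90, 99] */
        printf("[%lld,%lld]\n", r.start, r.end);
        return 0;
    }
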
diff --git a/fs/mbcache.c b/fs/mbcache.c
index b002a088857..298997f1747 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -116,7 +116,7 @@ mb_cache_indexes(struct mb_cache *cache)
* What the mbcache registers as to get shrunk dynamically.
*/
-static int mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask);
+static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
static inline int
@@ -140,7 +140,7 @@ __mb_cache_entry_unhash(struct mb_cache_entry *ce)
static inline void
-__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
+__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
struct mb_cache *cache = ce->e_cache;
@@ -193,7 +193,7 @@ forget:
* Returns the number of objects which are present in the cache.
*/
static int
-mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask)
+mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(free_list);
struct list_head *l, *ltmp;
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 154f511c724..626a367bcd8 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -454,10 +454,10 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
{
struct buffer_head *dotdot_bh;
struct msdos_dir_entry *dotdot_de;
- loff_t dotdot_i_pos;
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec ts;
+ loff_t dotdot_i_pos, new_i_pos;
int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -516,28 +516,24 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
if (new_inode) {
if (err)
goto out;
- if (MSDOS_I(new_inode)->i_pos != sinfo.i_pos) {
- /* WTF??? Cry and fail. */
- printk(KERN_WARNING "msdos_rename: fs corrupted\n");
- goto out;
- }
-
if (is_dir) {
err = fat_dir_empty(new_inode);
if (err)
goto out;
}
+ new_i_pos = MSDOS_I(new_inode)->i_pos;
fat_detach(new_inode);
} else {
err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
&ts, &sinfo);
if (err)
goto out;
+ new_i_pos = sinfo.i_pos;
}
new_dir->i_version++;
fat_detach(old_inode);
- fat_attach(old_inode, sinfo.i_pos);
+ fat_attach(old_inode, new_i_pos);
if (is_hid)
MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
else
@@ -604,7 +600,7 @@ error_inode:
fat_attach(old_inode, old_sinfo.i_pos);
MSDOS_I(old_inode)->i_attrs = old_attrs;
if (new_inode) {
- fat_attach(new_inode, sinfo.i_pos);
+ fat_attach(new_inode, new_i_pos);
if (corrupt)
corrupt |= fat_sync_inode(new_inode);
} else {
diff --git a/fs/namei.c b/fs/namei.c
index aa62dbda93a..c5769c4fcab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -28,6 +28,7 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/file.h>
#include <asm/namei.h>
#include <asm/uaccess.h>
@@ -317,6 +318,18 @@ void path_release_on_umount(struct nameidata *nd)
mntput_no_expire(nd->mnt);
}
+/**
+ * release_open_intent - free up open intent resources
+ * @nd: pointer to nameidata
+ */
+void release_open_intent(struct nameidata *nd)
+{
+ if (nd->intent.open.file->f_dentry == NULL)
+ put_filp(nd->intent.open.file);
+ else
+ fput(nd->intent.open.file);
+}
+
/*
* Internal lookup() using the new generic dcache.
* SMP-safe
@@ -750,6 +763,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
struct qstr this;
unsigned int c;
+ nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode, nd);
if (err == -EAGAIN) {
err = permission(inode, MAY_EXEC, nd);
@@ -802,7 +816,6 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
if (err < 0)
break;
}
- nd->flags |= LOOKUP_CONTINUE;
/* This does the actual lookups.. */
err = do_lookup(nd, &this, &next);
if (err)
@@ -1052,6 +1065,70 @@ out:
return retval;
}
+static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags, int create_mode)
+{
+ struct file *filp = get_empty_filp();
+ int err;
+
+ if (filp == NULL)
+ return -ENFILE;
+ nd->intent.open.file = filp;
+ nd->intent.open.flags = open_flags;
+ nd->intent.open.create_mode = create_mode;
+ err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+ if (IS_ERR(nd->intent.open.file)) {
+ if (err == 0) {
+ err = PTR_ERR(nd->intent.open.file);
+ path_release(nd);
+ }
+ } else if (err != 0)
+ release_open_intent(nd);
+ return err;
+}
+
+/**
+ * path_lookup_open - lookup a file path with open intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ */
+int path_lookup_open(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags)
+{
+ return __path_lookup_intent_open(name, lookup_flags, nd,
+ open_flags, 0);
+}
+
+/**
+ * path_lookup_create - lookup a file path with open + create intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ * @create_mode: create intent flags
+ */
+int path_lookup_create(const char *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags, int create_mode)
+{
+ return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
+ open_flags, create_mode);
+}
+
+int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
+ struct nameidata *nd, int open_flags)
+{
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+
+ if (!IS_ERR(tmp)) {
+ err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+ putname(tmp);
+ }
+ return err;
+}
+
/*
* Restricted form of lookup. Doesn't follow links, single-component only,
* needs parent already locked. Doesn't follow mounts.
@@ -1234,9 +1311,6 @@ static inline int may_create(struct inode *dir, struct dentry *child,
}
/*
- * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
- * reasons.
- *
* O_DIRECTORY translates into forcing a directory lookup.
*/
static inline int lookup_flags(unsigned int f)
@@ -1246,9 +1320,6 @@ static inline int lookup_flags(unsigned int f)
if (f & O_NOFOLLOW)
retval &= ~LOOKUP_FOLLOW;
- if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
- retval &= ~LOOKUP_FOLLOW;
-
if (f & O_DIRECTORY)
retval |= LOOKUP_DIRECTORY;
@@ -1416,27 +1487,27 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
*/
int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
{
- int acc_mode, error = 0;
+ int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
acc_mode = ACC_MODE(flag);
+ /* O_TRUNC implies we need access checks for write permissions */
+ if (flag & O_TRUNC)
+ acc_mode |= MAY_WRITE;
+
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flag & O_APPEND)
acc_mode |= MAY_APPEND;
- /* Fill in the open() intent data */
- nd->intent.open.flags = flag;
- nd->intent.open.create_mode = mode;
-
/*
* The simplest case - just a plain lookup.
*/
if (!(flag & O_CREAT)) {
- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
+ error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
if (error)
return error;
goto ok;
@@ -1445,7 +1516,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
/*
* Create - we need to know the parent.
*/
- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+ error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
if (error)
return error;
@@ -1520,6 +1591,8 @@ ok:
exit_dput:
dput_path(&path, nd);
exit:
+ if (!IS_ERR(nd->intent.open.file))
+ release_open_intent(nd);
path_release(nd);
return error;
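
__path_lookup_intent_open() pre-allocates the struct file for the open intent before walking the path, and the new release_open_intent() tears it down by the correct route: put_filp() when the lookup never attached the file to a dentry, fput() once it did. The exit path in open_namei() then owes exactly one release whenever the intent file was not consumed. A userspace model of that two-destructor ownership rule (all names illustrative, not VFS API):

    #include <stdio.h>
    #include <stdlib.h>

    struct filp {
        int bound;  /* did the lookup attach it to an object? */
    };

    static void put_filp(struct filp *f) { puts("raw free");   free(f); }
    static void fput(struct filp *f)     { puts("full close"); free(f); }

    /* Mirrors release_open_intent(): the tear-down route depends on how
     * far the lookup got before failing. */
    static void release_intent(struct filp *f)
    {
        if (!f->bound)
            put_filp(f);    /* never attached: no release hooks to run */
        else
            fput(f);        /* attached: run the normal close path */
    }

    int main(void)
    {
        struct filp *f = calloc(1, sizeof(*f));

        if (!f)
            return 1;
        /* ...lookup fails before binding the file... */
        release_intent(f);
        return 0;
    }
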
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4a36839f0bb..44135af9894 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -142,7 +142,7 @@ static void nfs_msync_inode(struct inode *inode)
/*
* Basic procedure for returning a delegation to the server
*/
-int nfs_inode_return_delegation(struct inode *inode)
+int __nfs_inode_return_delegation(struct inode *inode)
{
struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 3f6c45a29d6..8017846b561 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -25,7 +25,7 @@ struct nfs_delegation {
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
-int nfs_inode_return_delegation(struct inode *inode);
+int __nfs_inode_return_delegation(struct inode *inode);
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
@@ -47,11 +47,25 @@ static inline int nfs_have_delegation(struct inode *inode, int flags)
return 1;
return 0;
}
+
+static inline int nfs_inode_return_delegation(struct inode *inode)
+{
+ int err = 0;
+
+ if (NFS_I(inode)->delegation != NULL)
+ err = __nfs_inode_return_delegation(inode);
+ return err;
+}
#else
static inline int nfs_have_delegation(struct inode *inode, int flags)
{
return 0;
}
+
+static inline int nfs_inode_return_delegation(struct inode *inode)
+{
+ return 0;
+}
#endif
#endif
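
Renaming the out-of-line function to __nfs_inode_return_delegation() and adding the inline nfs_inode_return_delegation() wrapper moves the cheap is-there-a-delegation test into every caller, so the common no-delegation case costs one pointer compare and no function call; the config-off stub keeps callers unconditional. The idiom, sketched with illustrative types:

    struct inode { void *delegation; /* ... */ };

    /* Out-of-line slow path: does the real work (defined elsewhere). */
    int __return_delegation(struct inode *inode);

    /* Inline fast path: one compare in the common case. */
    static inline int return_delegation(struct inode *inode)
    {
        if (inode->delegation != NULL)
            return __return_delegation(inode);
        return 0;
    }
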
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2df639f143e..8272ed3fc70 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -532,6 +532,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
my_entry.eof = 0;
my_entry.fh = &fh;
my_entry.fattr = &fattr;
+ nfs_fattr_init(&fattr);
desc->entry = &my_entry;
while(!desc->entry->eof) {
@@ -565,8 +566,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
}
unlock_kernel();
- if (desc->error < 0)
- return desc->error;
if (res < 0)
return res;
return 0;
@@ -803,6 +802,7 @@ static int nfs_dentry_delete(struct dentry *dentry)
*/
static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
{
+ nfs_inode_return_delegation(inode);
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
lock_kernel();
inode->i_nlink--;
@@ -853,12 +853,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
lock_kernel();
- /* Revalidate parent directory attribute cache */
- error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
- if (error < 0) {
- res = ERR_PTR(error);
- goto out_unlock;
- }
/* If we're doing an exclusive create, optimize away the lookup */
if (nfs_is_exclusive_create(dir, nd))
@@ -916,7 +910,6 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
struct dentry *res = NULL;
- struct inode *inode = NULL;
int error;
/* Check that we are indeed trying to open this file */
@@ -930,8 +923,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
/* Let vfs_create() deal with O_EXCL */
- if (nd->intent.open.flags & O_EXCL)
- goto no_entry;
+ if (nd->intent.open.flags & O_EXCL) {
+ d_add(dentry, NULL);
+ goto out;
+ }
/* Open the file on the server */
lock_kernel();
@@ -945,32 +940,30 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
if (nd->intent.open.flags & O_CREAT) {
nfs_begin_data_update(dir);
- inode = nfs4_atomic_open(dir, dentry, nd);
+ res = nfs4_atomic_open(dir, dentry, nd);
nfs_end_data_update(dir);
} else
- inode = nfs4_atomic_open(dir, dentry, nd);
+ res = nfs4_atomic_open(dir, dentry, nd);
unlock_kernel();
- if (IS_ERR(inode)) {
- error = PTR_ERR(inode);
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
switch (error) {
/* Make a negative dentry */
case -ENOENT:
- inode = NULL;
- break;
+ res = NULL;
+ goto out;
/* This turned out not to be a regular file */
+ case -EISDIR:
+ case -ENOTDIR:
+ goto no_open;
case -ELOOP:
if (!(nd->intent.open.flags & O_NOFOLLOW))
goto no_open;
- /* case -EISDIR: */
/* case -EINVAL: */
default:
- res = ERR_PTR(error);
goto out;
}
- }
-no_entry:
- res = d_add_unique(dentry, inode);
- if (res != NULL)
+ } else if (res != NULL)
dentry = res;
nfs_renew_times(dentry);
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -1014,7 +1007,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
*/
lock_kernel();
verifier = nfs_save_change_attribute(dir);
- ret = nfs4_open_revalidate(dir, dentry, openflags);
+ ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
if (!ret)
nfs_set_verifier(dentry, verifier);
unlock_kernel();
@@ -1137,7 +1130,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
lock_kernel();
nfs_begin_data_update(dir);
- error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
nfs_end_data_update(dir);
if (error != 0)
goto out_err;
@@ -1332,6 +1325,7 @@ static int nfs_safe_remove(struct dentry *dentry)
nfs_begin_data_update(dir);
if (inode != NULL) {
+ nfs_inode_return_delegation(inode);
nfs_begin_data_update(inode);
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
/* The VFS may want to delete this inode */
@@ -1438,17 +1432,14 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
- /*
- * Drop the dentry in advance to force a new lookup.
- * Since nfs_proc_link doesn't return a file handle,
- * we can't use the existing dentry.
- */
lock_kernel();
- d_drop(dentry);
-
nfs_begin_data_update(dir);
nfs_begin_data_update(inode);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
+ if (error == 0) {
+ atomic_inc(&inode->i_count);
+ d_instantiate(dentry, inode);
+ }
nfs_end_data_update(inode);
nfs_end_data_update(dir);
unlock_kernel();
@@ -1512,9 +1503,11 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
if (!new_inode)
goto go_ahead;
- if (S_ISDIR(new_inode->i_mode))
- goto out;
- else if (atomic_read(&new_dentry->d_count) > 2) {
+ if (S_ISDIR(new_inode->i_mode)) {
+ error = -EISDIR;
+ if (!S_ISDIR(old_inode->i_mode))
+ goto out;
+ } else if (atomic_read(&new_dentry->d_count) > 2) {
int err;
/* copy the target dentry's name */
dentry = d_alloc(new_dentry->d_parent,
@@ -1539,7 +1532,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
#endif
goto out;
}
- }
+ } else
+ new_inode->i_nlink--;
go_ahead:
/*
@@ -1549,6 +1543,7 @@ go_ahead:
nfs_wb_all(old_inode);
shrink_dcache_parent(old_dentry);
}
+ nfs_inode_return_delegation(old_inode);
if (new_inode)
d_delete(new_dentry);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6bdcfa95de9..57d3e77d97e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -205,8 +205,8 @@ nfs_file_flush(struct file *file)
if (!status) {
status = ctx->error;
ctx->error = 0;
- if (!status && !nfs_have_delegation(inode, FMODE_READ))
- __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (!status)
+ nfs_revalidate_inode(NFS_SERVER(inode), inode);
}
unlock_kernel();
return status;
@@ -376,22 +376,31 @@ out_swapfile:
static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
{
+ struct file_lock *cfl;
struct inode *inode = filp->f_mapping->host;
int status = 0;
lock_kernel();
- /* Use local locking if mounted with "-onolock" */
- if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
- status = NFS_PROTO(inode)->lock(filp, cmd, fl);
- else {
- struct file_lock *cfl = posix_test_lock(filp, fl);
-
- fl->fl_type = F_UNLCK;
- if (cfl != NULL)
- memcpy(fl, cfl, sizeof(*fl));
+ /* Try local locking first */
+ cfl = posix_test_lock(filp, fl);
+ if (cfl != NULL) {
+ locks_copy_lock(fl, cfl);
+ goto out;
}
+
+ if (nfs_have_delegation(inode, FMODE_READ))
+ goto out_noconflict;
+
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
+ goto out_noconflict;
+
+ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+out:
unlock_kernel();
return status;
+out_noconflict:
+ fl->fl_type = F_UNLCK;
+ goto out;
}
static int do_vfs_lock(struct file *file, struct file_lock *fl)
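
The rewritten do_getlk() orders its F_GETLK checks from cheapest to most expensive: a hit in the local lock table is authoritative and needs no RPC; holding a read delegation means the server has promised no conflicting lockers exist; a "-onolock" mount never consults the server; only then is the lock-test RPC issued. Sketched as a decision function (illustrative names):

    enum lk_path { LK_LOCAL_CONFLICT, LK_NO_CONFLICT, LK_ASK_SERVER };

    /* Order matters: each test is cheaper than the one after it. */
    static enum lk_path getlk_path(int local_conflict, int have_delegation,
                                   int nonlm_mount)
    {
        if (local_conflict)
            return LK_LOCAL_CONFLICT;   /* copy the conflicting lock */
        if (have_delegation)
            return LK_NO_CONFLICT;      /* server vouches: F_UNLCK */
        if (nonlm_mount)
            return LK_NO_CONFLICT;      /* "-onolock": local only */
        return LK_ASK_SERVER;           /* issue the LOCK test RPC */
    }
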
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d4eadeea128..fc0f12ba89c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -358,6 +358,35 @@ out_no_root:
return no_root_error;
}
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
+{
+ to->to_initval = timeo * HZ / 10;
+ to->to_retries = retrans;
+ if (!to->to_retries)
+ to->to_retries = 2;
+
+ switch (proto) {
+ case IPPROTO_TCP:
+ if (!to->to_initval)
+ to->to_initval = 60 * HZ;
+ if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+ to->to_initval = NFS_MAX_TCP_TIMEOUT;
+ to->to_increment = to->to_initval;
+ to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+ to->to_exponential = 0;
+ break;
+ case IPPROTO_UDP:
+ default:
+ if (!to->to_initval)
+ to->to_initval = 11 * HZ / 10;
+ if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+ to->to_initval = NFS_MAX_UDP_TIMEOUT;
+ to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+ to->to_exponential = 1;
+ break;
+ }
+}
+
/*
* Create an RPC client handle.
*/
@@ -367,22 +396,12 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
struct rpc_timeout timeparms;
struct rpc_xprt *xprt = NULL;
struct rpc_clnt *clnt = NULL;
- int tcp = (data->flags & NFS_MOUNT_TCP);
+ int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
- /* Initialize timeout values */
- timeparms.to_initval = data->timeo * HZ / 10;
- timeparms.to_retries = data->retrans;
- timeparms.to_maxval = tcp ? RPC_MAX_TCP_TIMEOUT : RPC_MAX_UDP_TIMEOUT;
- timeparms.to_exponential = 1;
-
- if (!timeparms.to_initval)
- timeparms.to_initval = (tcp ? 600 : 11) * HZ / 10;
- if (!timeparms.to_retries)
- timeparms.to_retries = 5;
+ nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
/* create transport and client */
- xprt = xprt_create_proto(tcp ? IPPROTO_TCP : IPPROTO_UDP,
- &server->addr, &timeparms);
+ xprt = xprt_create_proto(proto, &server->addr, &timeparms);
if (IS_ERR(xprt)) {
dprintk("%s: cannot create RPC transport. Error = %ld\n",
__FUNCTION__, PTR_ERR(xprt));
@@ -576,7 +595,6 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
{ NFS_MOUNT_SOFT, ",soft", ",hard" },
{ NFS_MOUNT_INTR, ",intr", "" },
{ NFS_MOUNT_POSIX, ",posix", "" },
- { NFS_MOUNT_TCP, ",tcp", ",udp" },
{ NFS_MOUNT_NOCTO, ",nocto", "" },
{ NFS_MOUNT_NOAC, ",noac", "" },
{ NFS_MOUNT_NONLM, ",nolock", ",lock" },
@@ -585,6 +603,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
};
struct proc_nfs_info *nfs_infop;
struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+ char buf[12];
+ char *proto;
seq_printf(m, ",v%d", nfss->rpc_ops->version);
seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -603,6 +623,18 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
else
seq_puts(m, nfs_infop->nostr);
}
+ switch (nfss->client->cl_xprt->prot) {
+ case IPPROTO_TCP:
+ proto = "tcp";
+ break;
+ case IPPROTO_UDP:
+ proto = "udp";
+ break;
+ default:
+ snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
+ proto = buf;
+ }
+ seq_printf(m, ",proto=%s", proto);
seq_puts(m, ",addr=");
seq_escape(m, nfss->hostname, " \t\n\\");
return 0;
@@ -753,7 +785,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else
init_special_inode(inode, inode->i_mode, fattr->rdev);
- nfsi->read_cache_jiffies = fattr->timestamp;
+ nfsi->read_cache_jiffies = fattr->time_start;
+ nfsi->last_updated = jiffies;
inode->i_atime = fattr->atime;
inode->i_mtime = fattr->mtime;
inode->i_ctime = fattr->ctime;
@@ -821,6 +854,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
filemap_fdatawait(inode->i_mapping);
nfs_wb_all(inode);
}
+ /*
+ * Return any delegations if we're going to change ACLs
+ */
+ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
+ nfs_inode_return_delegation(inode);
error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
if (error == 0)
nfs_refresh_inode(inode, &fattr);
@@ -1019,15 +1057,11 @@ int nfs_open(struct inode *inode, struct file *filp)
ctx->mode = filp->f_mode;
nfs_file_set_open_context(filp, ctx);
put_nfs_open_context(ctx);
- if ((filp->f_mode & FMODE_WRITE) != 0)
- nfs_begin_data_update(inode);
return 0;
}
int nfs_release(struct inode *inode, struct file *filp)
{
- if ((filp->f_mode & FMODE_WRITE) != 0)
- nfs_end_data_update(inode);
nfs_file_clear_open_context(filp);
return 0;
}
@@ -1083,14 +1117,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
}
+ spin_lock(&inode->i_lock);
status = nfs_update_inode(inode, &fattr, verifier);
if (status) {
+ spin_unlock(&inode->i_lock);
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
goto out;
}
- spin_lock(&inode->i_lock);
cache_validity = nfsi->cache_validity;
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
@@ -1098,7 +1133,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
* We may need to keep the attributes marked as invalid if
* we raced with nfs_end_attr_update().
*/
- if (verifier == nfsi->cache_change_attribute)
+ if (time_after_eq(verifier, nfsi->cache_change_attribute))
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
spin_unlock(&inode->i_lock);
@@ -1165,7 +1200,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if (S_ISDIR(inode->i_mode)) {
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
/* This ensures we revalidate child dentries */
- nfsi->cache_change_attribute++;
+ nfsi->cache_change_attribute = jiffies;
}
spin_unlock(&inode->i_lock);
@@ -1197,20 +1232,19 @@ void nfs_end_data_update(struct inode *inode)
struct nfs_inode *nfsi = NFS_I(inode);
if (!nfs_have_delegation(inode, FMODE_READ)) {
- /* Mark the attribute cache for revalidation */
- spin_lock(&inode->i_lock);
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
- /* Directories and symlinks: invalidate page cache too */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ /* Directories and symlinks: invalidate page cache */
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
+ }
}
- nfsi->cache_change_attribute ++;
+ nfsi->cache_change_attribute = jiffies;
atomic_dec(&nfsi->data_updates);
}
/**
- * nfs_refresh_inode - verify consistency of the inode attribute cache
+ * nfs_check_inode_attributes - verify consistency of the inode attribute cache
* @inode - pointer to inode
* @fattr - updated attributes
*
@@ -1218,13 +1252,12 @@ void nfs_end_data_update(struct inode *inode)
* so that fattr carries weak cache consistency data, then it may
* also update the ctime/mtime/change_attribute.
*/
-int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
{
struct nfs_inode *nfsi = NFS_I(inode);
loff_t cur_size, new_isize;
int data_unstable;
- spin_lock(&inode->i_lock);
/* Are we in the process of updating data on the server? */
data_unstable = nfs_caches_unstable(inode);
@@ -1241,14 +1274,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
}
if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
- spin_unlock(&inode->i_lock);
return 0;
}
/* Has the inode gone and changed behind our back? */
if (nfsi->fileid != fattr->fileid
|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
- spin_unlock(&inode->i_lock);
return -EIO;
}
@@ -1288,11 +1319,67 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
if (!timespec_equal(&inode->i_atime, &fattr->atime))
nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
- nfsi->read_cache_jiffies = fattr->timestamp;
- spin_unlock(&inode->i_lock);
+ nfsi->read_cache_jiffies = fattr->time_start;
return 0;
}
+/**
+ * nfs_refresh_inode - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * Check that an RPC call that returned attributes has not overlapped with
+ * other recent updates of the inode metadata, then decide whether it is
+ * safe to do a full update of the inode attributes, or whether just to
+ * call nfs_check_inode_attributes.
+ */
+int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int status;
+
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ return 0;
+ spin_lock(&inode->i_lock);
+ nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+ if (nfs_verify_change_attribute(inode, fattr->time_start))
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+ if (time_after(fattr->time_start, nfsi->last_updated))
+ status = nfs_update_inode(inode, fattr, fattr->time_start);
+ else
+ status = nfs_check_inode_attributes(inode, fattr);
+
+ spin_unlock(&inode->i_lock);
+ return status;
+}
+
+/**
+ * nfs_post_op_update_inode - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it.
+ */
+int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int status = 0;
+
+ spin_lock(&inode->i_lock);
+ if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+ goto out;
+ }
+ status = nfs_update_inode(inode, fattr, fattr->time_start);
+ if (time_after_eq(fattr->time_start, nfsi->cache_change_attribute))
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
+ nfsi->cache_change_attribute = jiffies;
+out:
+ spin_unlock(&inode->i_lock);
+ return status;
+}
+
/*
* Many nfs protocol calls return the new file attributes after
* an operation. Here we update the inode to reflect the state
@@ -1328,20 +1415,17 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
goto out_err;
}
- spin_lock(&inode->i_lock);
-
/*
* Make sure the inode's type hasn't changed.
*/
- if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
- spin_unlock(&inode->i_lock);
+ if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
goto out_changed;
- }
/*
* Update the read time so we don't revalidate too often.
*/
- nfsi->read_cache_jiffies = fattr->timestamp;
+ nfsi->read_cache_jiffies = fattr->time_start;
+ nfsi->last_updated = jiffies;
/* Are we racing with known updates of the metadata on the server? */
data_unstable = ! (nfs_verify_change_attribute(inode, verifier) ||
@@ -1354,7 +1438,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
/* Do we perhaps have any outstanding writes? */
if (nfsi->npages == 0) {
/* No, but did we race with nfs_end_data_update()? */
- if (verifier == nfsi->cache_change_attribute) {
+ if (time_after_eq(verifier, nfsi->cache_change_attribute)) {
inode->i_size = new_isize;
invalid |= NFS_INO_INVALID_DATA;
}
@@ -1430,7 +1514,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
if (!nfs_have_delegation(inode, FMODE_READ))
nfsi->cache_validity |= invalid;
- spin_unlock(&inode->i_lock);
return 0;
out_changed:
/*
@@ -1639,8 +1722,7 @@ static void nfs4_clear_inode(struct inode *inode)
struct nfs_inode *nfsi = NFS_I(inode);
/* If we are holding a delegation, return it! */
- if (nfsi->delegation != NULL)
- nfs_inode_return_delegation(inode);
+ nfs_inode_return_delegation(inode);
/* First call standard NFS clear_inode() code */
nfs_clear_inode(inode);
/* Now clear out any remaining state */
@@ -1669,7 +1751,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
struct rpc_clnt *clnt = NULL;
struct rpc_timeout timeparms;
rpc_authflavor_t authflavour;
- int proto, err = -EIO;
+ int err = -EIO;
sb->s_blocksize_bits = 0;
sb->s_blocksize = 0;
@@ -1687,30 +1769,8 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
server->acdirmax = data->acdirmax*HZ;
server->rpc_ops = &nfs_v4_clientops;
- /* Initialize timeout values */
-
- timeparms.to_initval = data->timeo * HZ / 10;
- timeparms.to_retries = data->retrans;
- timeparms.to_exponential = 1;
- if (!timeparms.to_retries)
- timeparms.to_retries = 5;
- proto = data->proto;
- /* Which IP protocol do we use? */
- switch (proto) {
- case IPPROTO_TCP:
- timeparms.to_maxval = RPC_MAX_TCP_TIMEOUT;
- if (!timeparms.to_initval)
- timeparms.to_initval = 600 * HZ / 10;
- break;
- case IPPROTO_UDP:
- timeparms.to_maxval = RPC_MAX_UDP_TIMEOUT;
- if (!timeparms.to_initval)
- timeparms.to_initval = 11 * HZ / 10;
- break;
- default:
- return -EINVAL;
- }
+ nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
clp = nfs4_get_client(&server->addr.sin_addr);
if (!clp) {
@@ -1735,7 +1795,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
down_write(&clp->cl_sem);
if (IS_ERR(clp->cl_rpcclient)) {
- xprt = xprt_create_proto(proto, &server->addr, &timeparms);
+ xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
if (IS_ERR(xprt)) {
up_write(&clp->cl_sem);
err = PTR_ERR(xprt);
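
nfs_init_timeout_values(), added at the top of this file's changes, folds two open-coded copies of the mount timeout setup into one helper that both nfs_create_client() and nfs4_fill_super() now call. data->timeo is in tenths of a second, so to_initval = timeo * HZ / 10 yields jiffies; TCP then backs off linearly (to_maxval = initval + initval * retries) while UDP doubles each retry (to_exponential = 1), both capped by a transport maximum. A worked instance of the arithmetic (HZ and the cap constants are configuration-dependent; values here are illustrative):

    #include <stdio.h>

    #define HZ 1000     /* assume a 1000 Hz example kernel */

    int main(void)
    {
        unsigned int timeo = 7, retrans = 3;        /* "timeo=7,retrans=3" */
        unsigned long initval = timeo * HZ / 10;    /* 0.7 s = 700 jiffies */

        /* TCP: linear back-off at fixed increments of initval. */
        unsigned long tcp_max = initval + initval * retrans;   /* 2800 */

        printf("initval=%lu jiffies, tcp to_maxval=%lu jiffies\n",
               initval, tcp_max);
        /* UDP instead sets to_exponential: waits go 700, 1400, 2800, ...
         * capped at the transport maximum. */
        return 0;
    }
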
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index d91b69044a4..59049e864ca 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -143,7 +143,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
fattr->rdev = 0;
}
- fattr->timestamp = jiffies;
return p;
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index edc95514046..92c870d19cc 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -78,7 +78,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("%s: call fsinfo\n", __FUNCTION__);
- info->fattr->valid = 0;
+ nfs_fattr_init(info->fattr);
status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
@@ -98,7 +98,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call getattr\n");
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(server->client, NFS3PROC_GETATTR,
fhandle, fattr, 0);
dprintk("NFS reply getattr: %d\n", status);
@@ -117,7 +117,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;
dprintk("NFS call setattr\n");
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
@@ -143,8 +143,8 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
int status;
dprintk("NFS call lookup %s\n", name->name);
- dir_attr.valid = 0;
- fattr->valid = 0;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(fattr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0);
if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR))
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR,
@@ -174,7 +174,6 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
int status;
dprintk("NFS call access\n");
- fattr.valid = 0;
if (mode & MAY_READ)
arg.access |= NFS3_ACCESS_READ;
@@ -189,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
if (mode & MAY_EXEC)
arg.access |= NFS3_ACCESS_EXECUTE;
}
+ nfs_fattr_init(&fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
nfs_refresh_inode(inode, &fattr);
if (status == 0) {
@@ -217,7 +217,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
int status;
dprintk("NFS call readlink\n");
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK,
&args, &fattr, 0);
nfs_refresh_inode(inode, &fattr);
@@ -240,7 +240,7 @@ static int nfs3_proc_read(struct nfs_read_data *rdata)
dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
(long long) rdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
if (status >= 0)
nfs_refresh_inode(inode, fattr);
@@ -263,10 +263,10 @@ static int nfs3_proc_write(struct nfs_write_data *wdata)
dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
(long long) wdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags);
if (status >= 0)
- nfs_refresh_inode(inode, fattr);
+ nfs_post_op_update_inode(inode, fattr);
dprintk("NFS reply write: %d\n", status);
return status < 0? status : wdata->res.count;
}
@@ -285,10 +285,10 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata)
dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
(long long) cdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status >= 0)
- nfs_refresh_inode(inode, fattr);
+ nfs_post_op_update_inode(inode, fattr);
dprintk("NFS reply commit: %d\n", status);
return status;
}
@@ -299,7 +299,7 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata)
*/
static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
- int flags)
+ int flags, struct nameidata *nd)
{
struct nfs_fh fhandle;
struct nfs_fattr fattr;
@@ -329,10 +329,10 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
sattr->ia_mode &= ~current->fs->umask;
again:
- dir_attr.valid = 0;
- fattr.valid = 0;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
/* If the server doesn't support the exclusive creation semantics,
* try again with simple 'guarded' mode. */
@@ -401,9 +401,9 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
int status;
dprintk("NFS call remove %s\n", name->name);
- dir_attr.valid = 0;
+ nfs_fattr_init(&dir_attr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
dprintk("NFS reply remove: %d\n", status);
return status;
}
@@ -422,7 +422,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr
ptr->arg.fh = NFS_FH(dir->d_inode);
ptr->arg.name = name->name;
ptr->arg.len = name->len;
- ptr->res.valid = 0;
+ nfs_fattr_init(&ptr->res);
msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
msg->rpc_argp = &ptr->arg;
msg->rpc_resp = &ptr->res;
@@ -439,7 +439,7 @@ nfs3_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
return 1;
if (msg->rpc_argp) {
dir_attr = (struct nfs_fattr*)msg->rpc_resp;
- nfs_refresh_inode(dir->d_inode, dir_attr);
+ nfs_post_op_update_inode(dir->d_inode, dir_attr);
kfree(msg->rpc_argp);
}
return 0;
@@ -465,11 +465,11 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
int status;
dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
- old_dir_attr.valid = 0;
- new_dir_attr.valid = 0;
+ nfs_fattr_init(&old_dir_attr);
+ nfs_fattr_init(&new_dir_attr);
status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0);
- nfs_refresh_inode(old_dir, &old_dir_attr);
- nfs_refresh_inode(new_dir, &new_dir_attr);
+ nfs_post_op_update_inode(old_dir, &old_dir_attr);
+ nfs_post_op_update_inode(new_dir, &new_dir_attr);
dprintk("NFS reply rename: %d\n", status);
return status;
}
@@ -491,11 +491,11 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
int status;
dprintk("NFS call link %s\n", name->name);
- dir_attr.valid = 0;
- fattr.valid = 0;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0);
- nfs_refresh_inode(dir, &dir_attr);
- nfs_refresh_inode(inode, &fattr);
+ nfs_post_op_update_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(inode, &fattr);
dprintk("NFS reply link: %d\n", status);
return status;
}
@@ -524,10 +524,10 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
if (path->len > NFS3_MAXPATHLEN)
return -ENAMETOOLONG;
dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
- dir_attr.valid = 0;
- fattr->valid = 0;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(fattr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
dprintk("NFS reply symlink: %d\n", status);
return status;
}
@@ -552,13 +552,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
int status;
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
- dir_attr.valid = 0;
- fattr.valid = 0;
sattr->ia_mode &= ~current->fs->umask;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
if (status != 0)
goto out;
status = nfs_instantiate(dentry, &fhandle, &fattr);
@@ -582,9 +582,9 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
int status;
dprintk("NFS call rmdir %s\n", name->name);
- dir_attr.valid = 0;
+ nfs_fattr_init(&dir_attr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
dprintk("NFS reply rmdir: %d\n", status);
return status;
}
@@ -634,7 +634,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
dprintk("NFS call readdir%s %d\n",
plus? "plus" : "", (unsigned int) cookie);
- dir_attr.valid = 0;
+ nfs_fattr_init(&dir_attr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_refresh_inode(dir, &dir_attr);
dprintk("NFS reply readdir: %d\n", status);
@@ -676,10 +676,10 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
sattr->ia_mode &= ~current->fs->umask;
- dir_attr.valid = 0;
- fattr.valid = 0;
+ nfs_fattr_init(&dir_attr);
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, &dir_attr);
if (status != 0)
goto out;
status = nfs_instantiate(dentry, &fh, &fattr);
@@ -698,7 +698,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call fsstat\n");
- stat->fattr->valid = 0;
+ nfs_fattr_init(stat->fattr);
status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0);
dprintk("NFS reply statfs: %d\n", status);
return status;
@@ -711,7 +711,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call fsinfo\n");
- info->fattr->valid = 0;
+ nfs_fattr_init(info->fattr);
status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
dprintk("NFS reply fsinfo: %d\n", status);
return status;
@@ -724,7 +724,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call pathconf\n");
- info->fattr->valid = 0;
+ nfs_fattr_init(info->fattr);
status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0);
dprintk("NFS reply pathconf: %d\n", status);
return status;
@@ -735,7 +735,7 @@ extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
static void
nfs3_read_done(struct rpc_task *task)
{
- struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
+ struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata;
if (nfs3_async_handle_jukebox(task))
return;
@@ -775,7 +775,7 @@ nfs3_write_done(struct rpc_task *task)
return;
data = (struct nfs_write_data *)task->tk_calldata;
if (task->tk_status >= 0)
- nfs_refresh_inode(data->inode, data->res.fattr);
+ nfs_post_op_update_inode(data->inode, data->res.fattr);
nfs_writeback_done(task);
}
@@ -819,7 +819,7 @@ nfs3_commit_done(struct rpc_task *task)
return;
data = (struct nfs_write_data *)task->tk_calldata;
if (task->tk_status >= 0)
- nfs_refresh_inode(data->inode, data->res.fattr);
+ nfs_post_op_update_inode(data->inode, data->res.fattr);
nfs_commit_done(task);
}
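
The many fattr->valid = 0 to nfs_fattr_init() substitutions in this file pair with the new fattr->time_start field in fs/nfs/inode.c: once the struct has more than one field that must be primed before each RPC, an init helper keeps call sites from silently missing new ones, while the companion nfs_refresh_inode() to nfs_post_op_update_inode() changes mark which calls actually mutated the directory or file. The init-helper idiom (sketch; the field set is illustrative):

    #include <time.h>

    struct fattr {
        unsigned int valid;     /* which attributes the reply filled in */
        time_t time_start;      /* when the request was launched */
    };

    /* One place to grow when struct fattr grows; callers stay correct. */
    static inline void fattr_init(struct fattr *fattr)
    {
        fattr->valid = 0;
        fattr->time_start = time(NULL);
    }
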
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index db4a904810a..0498bd36602 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -174,7 +174,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
/* Update the mode bits */
fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
- fattr->timestamp = jiffies;
return p;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ec1a22d7b87..78a53f5a9f1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -93,25 +93,50 @@ struct nfs4_client {
};
/*
+ * struct rpc_sequence ensures that RPC calls are sent in the exact
+ * order that they appear on the list.
+ */
+struct rpc_sequence {
+ struct rpc_wait_queue wait; /* RPC call delay queue */
+ spinlock_t lock; /* Protects the list */
+ struct list_head list; /* Defines sequence of RPC calls */
+};
+
+#define NFS_SEQID_CONFIRMED 1
+struct nfs_seqid_counter {
+ struct rpc_sequence *sequence;
+ int flags;
+ u32 counter;
+};
+
+struct nfs_seqid {
+ struct nfs_seqid_counter *sequence;
+ struct list_head list;
+};
+
+static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
+{
+ if (seqid_mutating_err(-status))
+ seqid->flags |= NFS_SEQID_CONFIRMED;
+}
+
+/*
* NFS4 state_owners and lock_owners are simply labels for ordered
* sequences of RPC calls. Their sole purpose is to provide once-only
* semantics by allowing the server to identify replayed requests.
- *
- * The ->so_sema is held during all state_owner seqid-mutating operations:
- * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize
- * so_seqid.
*/
struct nfs4_state_owner {
+ spinlock_t so_lock;
struct list_head so_list; /* per-clientid list of state_owners */
struct nfs4_client *so_client;
u32 so_id; /* 32-bit identifier, unique */
- struct semaphore so_sema;
- u32 so_seqid; /* protected by so_sema */
atomic_t so_count;
struct rpc_cred *so_cred; /* Associated cred */
struct list_head so_states;
struct list_head so_delegations;
+ struct nfs_seqid_counter so_seqid;
+ struct rpc_sequence so_sequence;
};
/*
@@ -132,7 +157,7 @@ struct nfs4_lock_state {
fl_owner_t ls_owner; /* POSIX lock owner */
#define NFS_LOCK_INITIALIZED 1
int ls_flags;
- u32 ls_seqid;
+ struct nfs_seqid_counter ls_seqid;
u32 ls_id;
nfs4_stateid ls_stateid;
atomic_t ls_count;
@@ -153,7 +178,6 @@ struct nfs4_state {
struct inode *inode; /* Pointer to the inode */
unsigned long flags; /* Do we hold any locks? */
- struct semaphore lock_sema; /* Serializes file locking operations */
spinlock_t state_lock; /* Protects the lock_states list */
nfs4_stateid stateid;
@@ -191,8 +215,8 @@ extern int nfs4_proc_setclientid_confirm(struct nfs4_client *);
extern int nfs4_proc_async_renew(struct nfs4_client *);
extern int nfs4_proc_renew(struct nfs4_client *);
extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode);
-extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
+extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -224,12 +248,17 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, mode_t);
extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
-extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
extern void nfs4_schedule_state_recovery(struct nfs4_client *);
+extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
+extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
+extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_free_seqid(struct nfs_seqid *seqid);
+
extern const nfs4_stateid zero_stateid;
/* nfs4xdr.c */
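
The five prototypes above fix the whole lifecycle of a sequence id: allocate, wait for your turn, issue the RPC, record whether the counter mutated, then free, which admits the next waiter. A userspace sketch of that discipline with hypothetical reduced types; the real implementations live in fs/nfs/nfs4state.c:

    #include <stdio.h>
    #include <stdlib.h>

    struct counter { unsigned int value; };
    struct seqid   { struct counter *c; };

    static struct seqid *seqid_alloc(struct counter *c)   /* nfs_alloc_seqid */
    {
            struct seqid *s = malloc(sizeof(*s));
            if (s)
                    s->c = c;
            return s;
    }

    /* nfs_increment_*_seqid: the real code also bumps on "seqid mutating"
     * errors, not just on success. */
    static void seqid_increment(int status, struct seqid *s)
    {
            if (status == 0)
                    s->c->value++;
    }

    static void seqid_free(struct seqid *s)               /* nfs_free_seqid */
    {
            free(s);  /* the real code also wakes the next queued waiter */
    }

    int main(void)
    {
            struct counter open_owner = { 0 };
            struct seqid *s = seqid_alloc(&open_owner);

            if (s == NULL)
                    return 1;
            /* ...wait for our turn, transmit, collect the status... */
            seqid_increment(0, s);  /* pretend the call succeeded */
            seqid_free(s);
            printf("owner counter is now %u\n", open_owner.value);
            return 0;
    }
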
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9701ca8c942..933e13b383f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -47,6 +47,7 @@
#include <linux/nfs_page.h>
#include <linux/smp_lock.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include "nfs4_fs.h"
#include "delegation.h"
@@ -56,10 +57,11 @@
#define NFS4_POLL_RETRY_MIN (1*HZ)
#define NFS4_POLL_RETRY_MAX (15*HZ)
+static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
-static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
+static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
extern struct rpc_procinfo nfs4_procedures[];
@@ -185,8 +187,26 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf
{
struct nfs_inode *nfsi = NFS_I(inode);
+ spin_lock(&inode->i_lock);
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (cinfo->before == nfsi->change_attr && cinfo->atomic)
nfsi->change_attr = cinfo->after;
+ spin_unlock(&inode->i_lock);
+}
+
+/* Helper for asynchronous RPC calls */
+static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin,
+ rpc_action tk_exit, void *calldata)
+{
+ struct rpc_task *task;
+
+ if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC)))
+ return -ENOMEM;
+
+ task->tk_calldata = calldata;
+ task->tk_action = tk_begin;
+ rpc_execute(task);
+ return 0;
}
static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
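
nfs4_call_async() above splits an asynchronous call into a "begin" action, which runs after the task exists (so it can sleep on the seqid queue before building the rpc_message), and an exit action for teardown. A userspace sketch of that division of labour, with the asynchronous machinery deliberately collapsed into direct calls:

    #include <stdio.h>

    struct task { const char *name; int status; };
    typedef void (*action)(struct task *);

    /* stand-in for rpc_new_task() + rpc_execute() */
    static int call_async(action begin, action done, const char *name)
    {
            struct task t = { name, 0 };
            begin(&t);  /* wait in the seqid queue, set up arguments */
            done(&t);   /* bump the seqid, release resources */
            return 0;
    }

    static void my_begin(struct task *t) { printf("begin %s\n", t->name); }
    static void my_done(struct task *t)  { printf("done %s (%d)\n", t->name, t->status); }

    int main(void)
    {
            return call_async(my_begin, my_done, "CLOSE");
    }
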
@@ -195,6 +215,7 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid,
open_flags &= (FMODE_READ|FMODE_WRITE);
/* Protect against nfs4_find_state() */
+ spin_lock(&state->owner->so_lock);
spin_lock(&inode->i_lock);
state->state |= open_flags;
/* NB! List reordering - see the reclaim code for why. */
@@ -204,12 +225,12 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid,
state->nreaders++;
memcpy(&state->stateid, stateid, sizeof(state->stateid));
spin_unlock(&inode->i_lock);
+ spin_unlock(&state->owner->so_lock);
}
/*
* OPEN_RECLAIM:
* reclaim state on the server after a reboot.
- * Assumes caller is holding the sp->so_sem
*/
static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
{
@@ -218,7 +239,6 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
struct nfs_delegation *delegation = NFS_I(inode)->delegation;
struct nfs_openargs o_arg = {
.fh = NFS_FH(inode),
- .seqid = sp->so_seqid,
.id = sp->so_id,
.open_flags = state->state,
.clientid = server->nfs4_state->cl_clientid,
@@ -245,8 +265,13 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
}
o_arg.u.delegation_type = delegation->type;
}
+ o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+ if (o_arg.seqid == NULL)
+ return -ENOMEM;
status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- nfs4_increment_seqid(status, sp);
+ /* Confirm the sequence as being established */
+ nfs_confirm_seqid(&sp->so_seqid, status);
+ nfs_increment_open_seqid(status, o_arg.seqid);
if (status == 0) {
memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
if (o_res.delegation_type != 0) {
@@ -256,6 +281,7 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
nfs_async_inode_return_delegation(inode, &o_res.stateid);
}
}
+ nfs_free_seqid(o_arg.seqid);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
/* Ensure we update the inode attributes */
NFS_CACHEINV(inode);
@@ -302,23 +328,35 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state
};
int status = 0;
- down(&sp->so_sema);
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
goto out;
if (state->state == 0)
goto out;
- arg.seqid = sp->so_seqid;
+ arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+ status = -ENOMEM;
+ if (arg.seqid == NULL)
+ goto out;
arg.open_flags = state->state;
memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data));
status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- nfs4_increment_seqid(status, sp);
+ nfs_increment_open_seqid(status, arg.seqid);
+ if (status != 0)
+ goto out_free;
+ if (res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
+ status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode),
+ sp, &res.stateid, arg.seqid);
+ if (status != 0)
+ goto out_free;
+ }
+ nfs_confirm_seqid(&sp->so_seqid, 0);
if (status >= 0) {
memcpy(state->stateid.data, res.stateid.data,
sizeof(state->stateid.data));
clear_bit(NFS_DELEGATED_STATE, &state->flags);
}
+out_free:
+ nfs_free_seqid(arg.seqid);
out:
- up(&sp->so_sema);
dput(parent);
return status;
}
@@ -345,11 +383,11 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
return err;
}
-static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid)
+static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid)
{
struct nfs_open_confirmargs arg = {
.fh = fh,
- .seqid = sp->so_seqid,
+ .seqid = seqid,
.stateid = *stateid,
};
struct nfs_open_confirmres res;
@@ -362,7 +400,9 @@ static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nf
int status;
status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR);
- nfs4_increment_seqid(status, sp);
+ /* Confirm the sequence as being established */
+ nfs_confirm_seqid(&sp->so_seqid, status);
+ nfs_increment_open_seqid(status, seqid);
if (status >= 0)
memcpy(stateid, &res.stateid, sizeof(*stateid));
return status;
@@ -380,21 +420,41 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru
int status;
/* Update sequence id. The caller must serialize! */
- o_arg->seqid = sp->so_seqid;
o_arg->id = sp->so_id;
o_arg->clientid = sp->so_client->cl_clientid;
status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- nfs4_increment_seqid(status, sp);
+ if (status == 0) {
+ /* OPEN on anything except a regular file is disallowed in NFSv4 */
+ switch (o_res->f_attr->mode & S_IFMT) {
+ case S_IFREG:
+ break;
+ case S_IFLNK:
+ status = -ELOOP;
+ break;
+ case S_IFDIR:
+ status = -EISDIR;
+ break;
+ default:
+ status = -ENOTDIR;
+ }
+ }
+
+ nfs_increment_open_seqid(status, o_arg->seqid);
if (status != 0)
goto out;
- update_changeattr(dir, &o_res->cinfo);
+ if (o_arg->open_flags & O_CREAT) {
+ update_changeattr(dir, &o_res->cinfo);
+ nfs_post_op_update_inode(dir, o_res->dir_attr);
+ } else
+ nfs_refresh_inode(dir, o_res->dir_attr);
if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
status = _nfs4_proc_open_confirm(server->client, &o_res->fh,
- sp, &o_res->stateid);
+ sp, &o_res->stateid, o_arg->seqid);
if (status != 0)
goto out;
}
+ nfs_confirm_seqid(&sp->so_seqid, 0);
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
out:
@@ -441,9 +501,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
struct inode *inode = state->inode;
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_delegation *delegation = NFS_I(inode)->delegation;
- struct nfs_fattr f_attr = {
- .valid = 0,
- };
+ struct nfs_fattr f_attr, dir_attr;
struct nfs_openargs o_arg = {
.fh = NFS_FH(dir),
.open_flags = state->state,
@@ -453,6 +511,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
};
struct nfs_openres o_res = {
.f_attr = &f_attr,
+ .dir_attr = &dir_attr,
.server = server,
};
int status = 0;
@@ -465,6 +524,12 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
set_bit(NFS_DELEGATED_STATE, &state->flags);
goto out;
}
+ o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+ status = -ENOMEM;
+ if (o_arg.seqid == NULL)
+ goto out;
+ nfs_fattr_init(&f_attr);
+ nfs_fattr_init(&dir_attr);
status = _nfs4_proc_open(dir, sp, &o_arg, &o_res);
if (status != 0)
goto out_nodeleg;
@@ -490,6 +555,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
}
out_nodeleg:
+ nfs_free_seqid(o_arg.seqid);
clear_bit(NFS_DELEGATED_STATE, &state->flags);
out:
dput(parent);
@@ -564,7 +630,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
goto out_err;
}
- down(&sp->so_sema);
state = nfs4_get_open_state(inode, sp);
if (state == NULL)
goto out_err;
@@ -589,7 +654,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
set_bit(NFS_DELEGATED_STATE, &state->flags);
update_open_stateid(state, &delegation->stateid, open_flags);
out_ok:
- up(&sp->so_sema);
nfs4_put_state_owner(sp);
up_read(&nfsi->rwsem);
up_read(&clp->cl_sem);
@@ -600,11 +664,12 @@ out_err:
if (sp != NULL) {
if (state != NULL)
nfs4_put_open_state(state);
- up(&sp->so_sema);
nfs4_put_state_owner(sp);
}
up_read(&nfsi->rwsem);
up_read(&clp->cl_sem);
+ if (err != -EACCES)
+ nfs_inode_return_delegation(inode);
return err;
}
@@ -635,9 +700,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
struct nfs4_client *clp = server->nfs4_state;
struct inode *inode = NULL;
int status;
- struct nfs_fattr f_attr = {
- .valid = 0,
- };
+ struct nfs_fattr f_attr, dir_attr;
struct nfs_openargs o_arg = {
.fh = NFS_FH(dir),
.open_flags = flags,
@@ -648,6 +711,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
};
struct nfs_openres o_res = {
.f_attr = &f_attr,
+ .dir_attr = &dir_attr,
.server = server,
};
@@ -665,8 +729,12 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
} else
o_arg.u.attrs = sattr;
/* Serialization for the sequence id */
- down(&sp->so_sema);
+ o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+ if (o_arg.seqid == NULL)
+ return -ENOMEM;
+ nfs_fattr_init(&f_attr);
+ nfs_fattr_init(&dir_attr);
status = _nfs4_proc_open(dir, sp, &o_arg, &o_res);
if (status != 0)
goto out_err;
@@ -681,7 +749,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
update_open_stateid(state, &o_res.stateid, flags);
if (o_res.delegation_type != 0)
nfs_inode_set_delegation(inode, cred, &o_res);
- up(&sp->so_sema);
+ nfs_free_seqid(o_arg.seqid);
nfs4_put_state_owner(sp);
up_read(&clp->cl_sem);
*res = state;
@@ -690,7 +758,7 @@ out_err:
if (sp != NULL) {
if (state != NULL)
nfs4_put_open_state(state);
- up(&sp->so_sema);
+ nfs_free_seqid(o_arg.seqid);
nfs4_put_state_owner(sp);
}
/* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
@@ -718,7 +786,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
* It is actually a sign of a bug on the client or on the server.
*
* If we receive a BAD_SEQID error in the particular case of
- * doing an OPEN, we assume that nfs4_increment_seqid() will
+ * doing an OPEN, we assume that nfs_increment_open_seqid() will
* have unhashed the old state_owner for us, and that we can
* therefore safely retry using a new one. We should still warn
* the user though...
@@ -728,6 +796,16 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
exception.retry = 1;
continue;
}
+ /*
+ * BAD_STATEID on OPEN means that the server cancelled our
+ * state before it received the OPEN_CONFIRM.
+ * Recover by retrying the request as per the discussion
+ * on Page 181 of RFC3530.
+ */
+ if (status == -NFS4ERR_BAD_STATEID) {
+ exception.retry = 1;
+ continue;
+ }
res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
status, &exception));
} while (exception.retry);
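
Both BAD_SEQID and the newly handled BAD_STATEID are recovered by setting exception.retry and looping, so the whole OPEN is replayed with fresh state. The retry idiom in isolation (userspace sketch; do_open_once() is a hypothetical stand-in):

    #include <stdio.h>

    struct exception { int retry; };

    static int do_open_once(int attempt)
    {
            return attempt == 0 ? -1 : 0;  /* first try hits a retryable error */
    }

    int main(void)
    {
            struct exception exc = { 0 };
            int attempt = 0, status;

            do {
                    exc.retry = 0;
                    status = do_open_once(attempt++);
                    if (status == -1)      /* stands in for BAD_SEQID/BAD_STATEID */
                            exc.retry = 1; /* replay with fresh state */
            } while (exc.retry);
            printf("status %d after %d attempt(s)\n", status, attempt);
            return 0;
    }
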
@@ -755,7 +833,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
};
int status;
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
if (state != NULL) {
msg.rpc_cred = state->owner->so_cred;
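
Throughout this patch the open-coded `fattr->valid = 0` is replaced by nfs_fattr_init(), centralizing result-attribute setup. Only the valid-mask reset is visible in this diff; below is a reduced userspace model, on the assumption that the helper also timestamps the request so later revalidation can judge freshness:

    #include <stdio.h>
    #include <time.h>

    struct fattr_model {
            unsigned int valid;  /* bitmask: which attributes were decoded */
            time_t time_start;   /* assumed: when the request was launched */
    };

    static void fattr_init(struct fattr_model *f)
    {
            f->valid = 0;        /* nothing decoded yet */
            f->time_start = time(NULL);
    }

    int main(void)
    {
            struct fattr_model f;

            fattr_init(&f);
            printf("valid=%u start=%ld\n", f.valid, (long)f.time_start);
            return 0;
    }
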
@@ -787,19 +865,30 @@ struct nfs4_closedata {
struct nfs4_state *state;
struct nfs_closeargs arg;
struct nfs_closeres res;
+ struct nfs_fattr fattr;
};
+static void nfs4_free_closedata(struct nfs4_closedata *calldata)
+{
+ struct nfs4_state *state = calldata->state;
+ struct nfs4_state_owner *sp = state->owner;
+
+ nfs4_put_open_state(calldata->state);
+ nfs_free_seqid(calldata->arg.seqid);
+ nfs4_put_state_owner(sp);
+ kfree(calldata);
+}
+
static void nfs4_close_done(struct rpc_task *task)
{
struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
struct nfs4_state *state = calldata->state;
- struct nfs4_state_owner *sp = state->owner;
struct nfs_server *server = NFS_SERVER(calldata->inode);
/* hmm. we are done with the inode, and in the process of freeing
* the state_owner. we keep this around to process errors
*/
- nfs4_increment_seqid(task->tk_status, sp);
+ nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid);
switch (task->tk_status) {
case 0:
memcpy(&state->stateid, &calldata->res.stateid,
@@ -816,25 +905,49 @@ static void nfs4_close_done(struct rpc_task *task)
return;
}
}
+ nfs_refresh_inode(calldata->inode, calldata->res.fattr);
state->state = calldata->arg.open_flags;
- nfs4_put_open_state(state);
- up(&sp->so_sema);
- nfs4_put_state_owner(sp);
- up_read(&server->nfs4_state->cl_sem);
- kfree(calldata);
+ nfs4_free_closedata(calldata);
}
-static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata)
+static void nfs4_close_begin(struct rpc_task *task)
{
+ struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
+ struct nfs4_state *state = calldata->state;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
.rpc_argp = &calldata->arg,
.rpc_resp = &calldata->res,
- .rpc_cred = calldata->state->owner->so_cred,
+ .rpc_cred = state->owner->so_cred,
};
- if (calldata->arg.open_flags != 0)
+ int mode = 0;
+ int status;
+
+ status = nfs_wait_on_sequence(calldata->arg.seqid, task);
+ if (status != 0)
+ return;
+ /* Don't reorder reads */
+ smp_rmb();
+ /* Recalculate the new open mode in case someone reopened the file
+ * while we were waiting in line to be scheduled.
+ */
+ if (state->nreaders != 0)
+ mode |= FMODE_READ;
+ if (state->nwriters != 0)
+ mode |= FMODE_WRITE;
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags))
+ state->state = mode;
+ if (mode == state->state) {
+ nfs4_free_closedata(calldata);
+ task->tk_exit = NULL;
+ rpc_exit(task, 0);
+ return;
+ }
+ nfs_fattr_init(calldata->res.fattr);
+ if (mode != 0)
msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata);
+ calldata->arg.open_flags = mode;
+ rpc_call_setup(task, &msg, 0);
}
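
Because the task may have slept on the seqid queue, nfs4_close_begin() recomputes the open mode from nreaders/nwriters just before transmitting: a zero mode means a full CLOSE, a non-zero one an OPEN_DOWNGRADE, and a mode equal to the current state makes the RPC unnecessary. The recomputation in isolation:

    #include <stdio.h>

    #define FMODE_READ  1
    #define FMODE_WRITE 2

    static int recompute_mode(int nreaders, int nwriters)
    {
            int mode = 0;

            if (nreaders != 0)
                    mode |= FMODE_READ;
            if (nwriters != 0)
                    mode |= FMODE_WRITE;
            return mode;  /* 0 => CLOSE, else OPEN_DOWNGRADE to mode */
    }

    int main(void)
    {
            printf("%d\n", recompute_mode(1, 0));  /* downgrade to read-only */
            printf("%d\n", recompute_mode(0, 0));  /* plain CLOSE */
            return 0;
    }
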
/*
@@ -850,40 +963,57 @@ static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *
*/
int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode)
{
+ struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_closedata *calldata;
- int status;
+ int status = -ENOMEM;
- /* Tell caller we're done */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
- state->state = mode;
- return 0;
- }
- calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL);
+ calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
if (calldata == NULL)
- return -ENOMEM;
+ goto out;
calldata->inode = inode;
calldata->state = state;
calldata->arg.fh = NFS_FH(inode);
+ calldata->arg.stateid = &state->stateid;
/* Serialization for the sequence id */
- calldata->arg.seqid = state->owner->so_seqid;
- calldata->arg.open_flags = mode;
- memcpy(&calldata->arg.stateid, &state->stateid,
- sizeof(calldata->arg.stateid));
- status = nfs4_close_call(NFS_SERVER(inode)->client, calldata);
- /*
- * Return -EINPROGRESS on success in order to indicate to the
- * caller that an asynchronous RPC call has been launched, and
- * that it will release the semaphores on completion.
- */
- return (status == 0) ? -EINPROGRESS : status;
+ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
+ if (calldata->arg.seqid == NULL)
+ goto out_free_calldata;
+ calldata->arg.bitmask = server->attr_bitmask;
+ calldata->res.fattr = &calldata->fattr;
+ calldata->res.server = server;
+
+ status = nfs4_call_async(server->client, nfs4_close_begin,
+ nfs4_close_done, calldata);
+ if (status == 0)
+ goto out;
+
+ nfs_free_seqid(calldata->arg.seqid);
+out_free_calldata:
+ kfree(calldata);
+out:
+ return status;
}
-struct inode *
+static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
+{
+ struct file *filp;
+
+ filp = lookup_instantiate_filp(nd, dentry, NULL);
+ if (!IS_ERR(filp)) {
+ struct nfs_open_context *ctx;
+ ctx = (struct nfs_open_context *)filp->private_data;
+ ctx->state = state;
+ } else
+ nfs4_close_state(state, nd->intent.open.flags);
+}
+
+struct dentry *
nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
struct iattr attr;
struct rpc_cred *cred;
struct nfs4_state *state;
+ struct dentry *res;
if (nd->flags & LOOKUP_CREATE) {
attr.ia_mode = nd->intent.open.create_mode;
@@ -897,16 +1027,23 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
if (IS_ERR(cred))
- return (struct inode *)cred;
+ return (struct dentry *)cred;
state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
put_rpccred(cred);
- if (IS_ERR(state))
- return (struct inode *)state;
- return state->inode;
+ if (IS_ERR(state)) {
+ if (PTR_ERR(state) == -ENOENT)
+ d_add(dentry, NULL);
+ return (struct dentry *)state;
+ }
+ res = d_add_unique(dentry, state->inode);
+ if (res != NULL)
+ dentry = res;
+ nfs4_intent_set_file(nd, dentry, state);
+ return res;
}
int
-nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags)
+nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
{
struct rpc_cred *cred;
struct nfs4_state *state;
@@ -919,18 +1056,30 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags)
if (IS_ERR(state))
state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
put_rpccred(cred);
- if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
- return 1;
- if (IS_ERR(state))
- return 0;
+ if (IS_ERR(state)) {
+ switch (PTR_ERR(state)) {
+ case -EPERM:
+ case -EACCES:
+ case -EDQUOT:
+ case -ENOSPC:
+ case -EROFS:
+ lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
+ return 1;
+ case -ENOENT:
+ if (dentry->d_inode == NULL)
+ return 1;
+ }
+ goto out_drop;
+ }
inode = state->inode;
+ iput(inode);
if (inode == dentry->d_inode) {
- iput(inode);
+ nfs4_intent_set_file(nd, dentry, state);
return 1;
}
- d_drop(dentry);
nfs4_close_state(state, openflags);
- iput(inode);
+out_drop:
+ d_drop(dentry);
return 0;
}
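
The revalidate path now instantiates the open file itself via the intent data: on success the nfs4_state is hung off the file's open context, and on failure the state is closed instead of leaking. A toy model of that hand-off (reduced hypothetical types):

    #include <stdio.h>
    #include <stddef.h>

    struct state   { int mode; };
    struct context { struct state *state; };
    struct file    { struct context ctx; };

    static struct file *instantiate(int fail)
    {
            static struct file f;
            return fail ? NULL : &f;  /* NULL stands in for IS_ERR(filp) */
    }

    /* models nfs4_intent_set_file() */
    static void intent_set_file(struct state *st, int fail)
    {
            struct file *filp = instantiate(fail);

            if (filp != NULL)
                    filp->ctx.state = st;  /* the open context owns the state */
            else
                    printf("closing state %d\n", st->mode);  /* nfs4_close_state */
    }

    int main(void)
    {
            struct state st = { 1 };

            intent_set_file(&st, 0);
            intent_set_file(&st, 1);
            return 0;
    }
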
@@ -974,13 +1123,12 @@ static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
- struct nfs_fattr * fattr = info->fattr;
struct nfs4_lookup_root_arg args = {
.bitmask = nfs4_fattr_bitmap,
};
struct nfs4_lookup_res res = {
.server = server,
- .fattr = fattr,
+ .fattr = info->fattr,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -988,7 +1136,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
- fattr->valid = 0;
+ nfs_fattr_init(info->fattr);
return rpc_call_sync(server->client, &msg, 0);
}
@@ -1051,7 +1199,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
q.len = p - q.name;
do {
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = nfs4_handle_exception(server,
rpc_call_sync(server->client, &msg, 0),
&exception);
@@ -1088,7 +1236,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
return rpc_call_sync(server->client, &msg, 0);
}
@@ -1130,7 +1278,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_state *state;
int status;
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
if (IS_ERR(cred))
@@ -1176,7 +1324,7 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
.rpc_resp = &res,
};
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
dprintk("NFS call lookup %s\n", name->name);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
@@ -1325,7 +1473,7 @@ static int _nfs4_proc_read(struct nfs_read_data *rdata)
dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
(long long) rdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(server->client, &msg, flags);
if (!status)
renew_lease(server, timestamp);
@@ -1362,7 +1510,7 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata)
dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
(long long) wdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(server->client, &msg, rpcflags);
dprintk("NFS reply write: %d\n", status);
return status;
@@ -1396,7 +1544,7 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata)
dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
(long long) cdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(server->client, &msg, 0);
dprintk("NFS reply commit: %d\n", status);
return status;
@@ -1431,7 +1579,7 @@ static int nfs4_proc_commit(struct nfs_write_data *cdata)
static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
- int flags)
+ int flags, struct nameidata *nd)
{
struct nfs4_state *state;
struct rpc_cred *cred;
@@ -1453,24 +1601,30 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
struct nfs_fattr fattr;
status = nfs4_do_setattr(NFS_SERVER(dir), &fattr,
NFS_FH(state->inode), sattr, state);
- if (status == 0) {
+ if (status == 0)
nfs_setattr_update_inode(state->inode, sattr);
- goto out;
- }
- } else if (flags != 0)
- goto out;
- nfs4_close_state(state, flags);
+ }
+ if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN))
+ nfs4_intent_set_file(nd, dentry, state);
+ else
+ nfs4_close_state(state, flags);
out:
return status;
}
static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_remove_arg args = {
.fh = NFS_FH(dir),
.name = name,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs_fattr dir_attr;
+ struct nfs4_remove_res res = {
+ .server = server,
+ .dir_attr = &dir_attr,
};
- struct nfs4_change_info res;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
.rpc_argp = &args,
@@ -1478,9 +1632,12 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
};
int status;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (status == 0)
- update_changeattr(dir, &res);
+ nfs_fattr_init(res.dir_attr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ if (status == 0) {
+ update_changeattr(dir, &res.cinfo);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ }
return status;
}
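
Directory-mutating operations now send the attribute bitmask and apply the returned post-op attributes, sparing the client a separate GETATTR round trip. The heart of it is the atomic change_info check from update_changeattr() earlier in this patch; the same logic as a runnable userspace sketch:

    #include <stdio.h>

    struct change_info { unsigned long long before, after; int atomic; };

    static void update_changeattr(unsigned long long *cached,
                                  const struct change_info *c)
    {
            /* adopt the server's post-op value only if the pre/post pair
             * was sampled atomically around our own operation */
            if (c->atomic && c->before == *cached)
                    *cached = c->after;
            /* otherwise leave it stale; revalidation will refetch */
    }

    int main(void)
    {
            unsigned long long change = 7;
            struct change_info ci = { .before = 7, .after = 8, .atomic = 1 };

            update_changeattr(&change, &ci);
            printf("change attribute is now %llu\n", change);
            return 0;
    }
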
@@ -1498,12 +1655,14 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
struct unlink_desc {
struct nfs4_remove_arg args;
- struct nfs4_change_info res;
+ struct nfs4_remove_res res;
+ struct nfs_fattr dir_attr;
};
static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
struct qstr *name)
{
+ struct nfs_server *server = NFS_SERVER(dir->d_inode);
struct unlink_desc *up;
up = (struct unlink_desc *) kmalloc(sizeof(*up), GFP_KERNEL);
@@ -1512,6 +1671,9 @@ static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
up->args.fh = NFS_FH(dir->d_inode);
up->args.name = name;
+ up->args.bitmask = server->attr_bitmask;
+ up->res.server = server;
+ up->res.dir_attr = &up->dir_attr;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
msg->rpc_argp = &up->args;
@@ -1526,7 +1688,8 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
if (msg->rpc_resp != NULL) {
up = container_of(msg->rpc_resp, struct unlink_desc, res);
- update_changeattr(dir->d_inode, &up->res);
+ update_changeattr(dir->d_inode, &up->res.cinfo);
+ nfs_post_op_update_inode(dir->d_inode, up->res.dir_attr);
kfree(up);
msg->rpc_resp = NULL;
msg->rpc_argp = NULL;
@@ -1537,13 +1700,20 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
struct inode *new_dir, struct qstr *new_name)
{
+ struct nfs_server *server = NFS_SERVER(old_dir);
struct nfs4_rename_arg arg = {
.old_dir = NFS_FH(old_dir),
.new_dir = NFS_FH(new_dir),
.old_name = old_name,
.new_name = new_name,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs_fattr old_fattr, new_fattr;
+ struct nfs4_rename_res res = {
+ .server = server,
+ .old_fattr = &old_fattr,
+ .new_fattr = &new_fattr,
};
- struct nfs4_rename_res res = { };
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
.rpc_argp = &arg,
@@ -1551,11 +1721,15 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
};
int status;
- status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
+ nfs_fattr_init(res.old_fattr);
+ nfs_fattr_init(res.new_fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
if (!status) {
update_changeattr(old_dir, &res.old_cinfo);
+ nfs_post_op_update_inode(old_dir, res.old_fattr);
update_changeattr(new_dir, &res.new_cinfo);
+ nfs_post_op_update_inode(new_dir, res.new_fattr);
}
return status;
}
@@ -1576,22 +1750,34 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
+ struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_link_arg arg = {
.fh = NFS_FH(inode),
.dir_fh = NFS_FH(dir),
.name = name,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs_fattr fattr, dir_attr;
+ struct nfs4_link_res res = {
+ .server = server,
+ .fattr = &fattr,
+ .dir_attr = &dir_attr,
};
- struct nfs4_change_info cinfo = { };
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
.rpc_argp = &arg,
- .rpc_resp = &cinfo,
+ .rpc_resp = &res,
};
int status;
- status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- if (!status)
- update_changeattr(dir, &cinfo);
+ nfs_fattr_init(res.fattr);
+ nfs_fattr_init(res.dir_attr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ if (!status) {
+ update_changeattr(dir, &res.cinfo);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ nfs_refresh_inode(inode, res.fattr);
+ }
return status;
}
@@ -1613,6 +1799,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
struct nfs_fattr *fattr)
{
struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_fattr dir_fattr;
struct nfs4_create_arg arg = {
.dir_fh = NFS_FH(dir),
.server = server,
@@ -1625,6 +1812,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
.server = server,
.fh = fhandle,
.fattr = fattr,
+ .dir_fattr = &dir_fattr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
@@ -1636,11 +1824,13 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
if (path->len > NFS4_MAXPATHLEN)
return -ENAMETOOLONG;
arg.u.symlink = path;
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
+ nfs_fattr_init(&dir_fattr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
if (!status)
update_changeattr(dir, &res.dir_cinfo);
+ nfs_post_op_update_inode(dir, res.dir_fattr);
return status;
}
@@ -1664,7 +1854,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_fh fhandle;
- struct nfs_fattr fattr;
+ struct nfs_fattr fattr, dir_fattr;
struct nfs4_create_arg arg = {
.dir_fh = NFS_FH(dir),
.server = server,
@@ -1677,6 +1867,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
.server = server,
.fh = &fhandle,
.fattr = &fattr,
+ .dir_fattr = &dir_fattr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
@@ -1685,11 +1876,13 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
};
int status;
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
+ nfs_fattr_init(&dir_fattr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
if (!status) {
update_changeattr(dir, &res.dir_cinfo);
+ nfs_post_op_update_inode(dir, res.dir_fattr);
status = nfs_instantiate(dentry, &fhandle, &fattr);
}
return status;
@@ -1762,7 +1955,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_fh fh;
- struct nfs_fattr fattr;
+ struct nfs_fattr fattr, dir_fattr;
struct nfs4_create_arg arg = {
.dir_fh = NFS_FH(dir),
.server = server,
@@ -1774,6 +1967,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
.server = server,
.fh = &fh,
.fattr = &fattr,
+ .dir_fattr = &dir_fattr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
@@ -1783,7 +1977,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
int status;
int mode = sattr->ia_mode;
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
+ nfs_fattr_init(&dir_fattr);
BUG_ON(!(sattr->ia_valid & ATTR_MODE));
BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
@@ -1805,6 +2000,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
if (status == 0) {
update_changeattr(dir, &res.dir_cinfo);
+ nfs_post_op_update_inode(dir, res.dir_fattr);
status = nfs_instantiate(dentry, &fh, &fattr);
}
return status;
@@ -1836,7 +2032,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = fsstat,
};
- fsstat->fattr->valid = 0;
+ nfs_fattr_init(fsstat->fattr);
return rpc_call_sync(server->client, &msg, 0);
}
@@ -1883,7 +2079,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
{
- fsinfo->fattr->valid = 0;
+ nfs_fattr_init(fsinfo->fattr);
return nfs4_do_fsinfo(server, fhandle, fsinfo);
}
@@ -1906,7 +2102,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
return 0;
}
- pathconf->fattr->valid = 0;
+ nfs_fattr_init(pathconf->fattr);
return rpc_call_sync(server->client, &msg, 0);
}
@@ -1973,8 +2169,10 @@ nfs4_write_done(struct rpc_task *task)
rpc_restart_call(task);
return;
}
- if (task->tk_status >= 0)
+ if (task->tk_status >= 0) {
renew_lease(NFS_SERVER(inode), data->timestamp);
+ nfs_post_op_update_inode(inode, data->res.fattr);
+ }
/* Call back common NFS writeback processing */
nfs_writeback_done(task);
}
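
Note the ordering rule the write path relies on: the timestamp is sampled before the RPC is transmitted and the lease is renewed with it only on success, so the lease can never look fresher than the request that proved the server alive. In isolation (userspace sketch with time() standing in for jiffies):

    #include <stdio.h>
    #include <time.h>

    static time_t lease_renewed;

    static void maybe_renew(int status, time_t timestamp)
    {
            if (status >= 0 && timestamp > lease_renewed)
                    lease_renewed = timestamp;
    }

    int main(void)
    {
            time_t stamp = time(NULL);  /* sampled before the call went out */
            int status = 0;             /* pretend the WRITE succeeded */

            maybe_renew(status, stamp);
            printf("lease renewed at %ld\n", (long)lease_renewed);
            return 0;
    }
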
@@ -1990,6 +2188,7 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
.rpc_cred = data->cred,
};
struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
int stable;
int flags;
@@ -2001,6 +2200,8 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
} else
stable = NFS_UNSTABLE;
data->args.stable = stable;
+ data->args.bitmask = server->attr_bitmask;
+ data->res.server = server;
data->timestamp = jiffies;
@@ -2022,6 +2223,8 @@ nfs4_commit_done(struct rpc_task *task)
rpc_restart_call(task);
return;
}
+ if (task->tk_status >= 0)
+ nfs_post_op_update_inode(inode, data->res.fattr);
/* Call back common NFS writeback processing */
nfs_commit_done(task);
}
@@ -2037,8 +2240,12 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
.rpc_cred = data->cred,
};
struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
int flags;
+ data->args.bitmask = server->attr_bitmask;
+ data->res.server = server;
+
/* Set the initial flags for the task. */
flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
@@ -2106,65 +2313,6 @@ nfs4_proc_renew(struct nfs4_client *clp)
return 0;
}
-/*
- * We will need to arrange for the VFS layer to provide an atomic open.
- * Until then, this open method is prone to inefficiency and race conditions
- * due to the lookup, potential create, and open VFS calls from sys_open()
- * placed on the wire.
- */
-static int
-nfs4_proc_file_open(struct inode *inode, struct file *filp)
-{
- struct dentry *dentry = filp->f_dentry;
- struct nfs_open_context *ctx;
- struct nfs4_state *state = NULL;
- struct rpc_cred *cred;
- int status = -ENOMEM;
-
- dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n",
- (int)dentry->d_parent->d_name.len,
- dentry->d_parent->d_name.name,
- (int)dentry->d_name.len, dentry->d_name.name);
-
-
- /* Find our open stateid */
- cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
- if (IS_ERR(cred))
- return PTR_ERR(cred);
- ctx = alloc_nfs_open_context(dentry, cred);
- put_rpccred(cred);
- if (unlikely(ctx == NULL))
- return -ENOMEM;
- status = -EIO; /* ERACE actually */
- state = nfs4_find_state(inode, cred, filp->f_mode);
- if (unlikely(state == NULL))
- goto no_state;
- ctx->state = state;
- nfs4_close_state(state, filp->f_mode);
- ctx->mode = filp->f_mode;
- nfs_file_set_open_context(filp, ctx);
- put_nfs_open_context(ctx);
- if (filp->f_mode & FMODE_WRITE)
- nfs_begin_data_update(inode);
- return 0;
-no_state:
- printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
- put_nfs_open_context(ctx);
- return status;
-}
-
-/*
- * Release our state
- */
-static int
-nfs4_proc_file_release(struct inode *inode, struct file *filp)
-{
- if (filp->f_mode & FMODE_WRITE)
- nfs_end_data_update(inode);
- nfs_file_clear_open_context(filp);
- return 0;
-}
-
static inline int nfs4_server_supports_acls(struct nfs_server *server)
{
return (server->caps & NFS_CAP_ACLS)
@@ -2285,7 +2433,7 @@ static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size
return -ENOMEM;
args.acl_pages[0] = localpage;
args.acl_pgbase = 0;
- args.acl_len = PAGE_SIZE;
+ resp_len = args.acl_len = PAGE_SIZE;
} else {
resp_buf = buf;
buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
@@ -2345,6 +2493,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
if (!nfs4_server_supports_acls(server))
return -EOPNOTSUPP;
+ nfs_inode_return_delegation(inode);
buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
if (ret == 0)
@@ -2353,7 +2502,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
}
static int
-nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
{
struct nfs4_client *clp = server->nfs4_state;
@@ -2431,7 +2580,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
{
struct nfs4_client *clp = server->nfs4_state;
int ret = errorcode;
@@ -2632,7 +2781,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
down_read(&clp->cl_sem);
nlo.clientid = clp->cl_clientid;
- down(&state->lock_sema);
status = nfs4_set_lock_state(state, request);
if (status != 0)
goto out;
@@ -2659,7 +2807,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
status = 0;
}
out:
- up(&state->lock_sema);
up_read(&clp->cl_sem);
return status;
}
@@ -2696,79 +2843,149 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
return res;
}
-static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+struct nfs4_unlockdata {
+ struct nfs_lockargs arg;
+ struct nfs_locku_opargs luargs;
+ struct nfs_lockres res;
+ struct nfs4_lock_state *lsp;
+ struct nfs_open_context *ctx;
+ atomic_t refcount;
+ struct completion completion;
+};
+
+static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata)
{
- struct inode *inode = state->inode;
- struct nfs_server *server = NFS_SERVER(inode);
- struct nfs4_client *clp = server->nfs4_state;
- struct nfs_lockargs arg = {
- .fh = NFS_FH(inode),
- .type = nfs4_lck_type(cmd, request),
- .offset = request->fl_start,
- .length = nfs4_lck_length(request),
- };
- struct nfs_lockres res = {
- .server = server,
- };
+ if (atomic_dec_and_test(&calldata->refcount)) {
+ nfs_free_seqid(calldata->luargs.seqid);
+ nfs4_put_lock_state(calldata->lsp);
+ put_nfs_open_context(calldata->ctx);
+ kfree(calldata);
+ }
+}
+
+static void nfs4_locku_complete(struct nfs4_unlockdata *calldata)
+{
+ complete(&calldata->completion);
+ nfs4_locku_release_calldata(calldata);
+}
+
+static void nfs4_locku_done(struct rpc_task *task)
+{
+ struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
+
+ nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid);
+ switch (task->tk_status) {
+ case 0:
+ memcpy(calldata->lsp->ls_stateid.data,
+ calldata->res.u.stateid.data,
+ sizeof(calldata->lsp->ls_stateid.data));
+ break;
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_schedule_state_recovery(calldata->res.server->nfs4_state);
+ break;
+ default:
+ if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) {
+ rpc_restart_call(task);
+ return;
+ }
+ }
+ nfs4_locku_complete(calldata);
+}
+
+static void nfs4_locku_begin(struct rpc_task *task)
+{
+ struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- .rpc_cred = state->owner->so_cred,
+ .rpc_argp = &calldata->arg,
+ .rpc_resp = &calldata->res,
+ .rpc_cred = calldata->lsp->ls_state->owner->so_cred,
};
+ int status;
+
+ status = nfs_wait_on_sequence(calldata->luargs.seqid, task);
+ if (status != 0)
+ return;
+ if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
+ nfs4_locku_complete(calldata);
+ task->tk_exit = NULL;
+ rpc_exit(task, 0);
+ return;
+ }
+ rpc_call_setup(task, &msg, 0);
+}
+
+static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct nfs4_unlockdata *calldata;
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_lock_state *lsp;
- struct nfs_locku_opargs luargs;
int status;
-
- down_read(&clp->cl_sem);
- down(&state->lock_sema);
+
status = nfs4_set_lock_state(state, request);
if (status != 0)
- goto out;
+ return status;
lsp = request->fl_u.nfs4_fl.owner;
/* We might have lost the locks! */
if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0)
- goto out;
- luargs.seqid = lsp->ls_seqid;
- memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
- arg.u.locku = &luargs;
- status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- nfs4_increment_lock_seqid(status, lsp);
-
- if (status == 0)
- memcpy(&lsp->ls_stateid, &res.u.stateid,
- sizeof(lsp->ls_stateid));
-out:
- up(&state->lock_sema);
+ return 0;
+ calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
+ if (calldata == NULL)
+ return -ENOMEM;
+ calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+ if (calldata->luargs.seqid == NULL) {
+ kfree(calldata);
+ return -ENOMEM;
+ }
+ calldata->luargs.stateid = &lsp->ls_stateid;
+ calldata->arg.fh = NFS_FH(inode);
+ calldata->arg.type = nfs4_lck_type(cmd, request);
+ calldata->arg.offset = request->fl_start;
+ calldata->arg.length = nfs4_lck_length(request);
+ calldata->arg.u.locku = &calldata->luargs;
+ calldata->res.server = server;
+ calldata->lsp = lsp;
+ atomic_inc(&lsp->ls_count);
+
+ /* Ensure we don't close file until we're done freeing locks! */
+ calldata->ctx = get_nfs_open_context((struct nfs_open_context *)request->fl_file->private_data);
+
+ atomic_set(&calldata->refcount, 2);
+ init_completion(&calldata->completion);
+
+ status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin,
+ nfs4_locku_done, calldata);
if (status == 0)
- do_vfs_lock(request->fl_file, request);
- up_read(&clp->cl_sem);
+ wait_for_completion_interruptible(&calldata->completion);
+ do_vfs_lock(request->fl_file, request);
+ nfs4_locku_release_calldata(calldata);
return status;
}
-static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
-{
- struct nfs4_exception exception = { };
- int err;
-
- do {
- err = nfs4_handle_exception(NFS_SERVER(state->inode),
- _nfs4_proc_unlck(state, cmd, request),
- &exception);
- } while (exception.retry);
- return err;
-}
-
static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim)
{
struct inode *inode = state->inode;
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+ struct nfs_lock_opargs largs = {
+ .lock_stateid = &lsp->ls_stateid,
+ .open_stateid = &state->stateid,
+ .lock_owner = {
+ .clientid = server->nfs4_state->cl_clientid,
+ .id = lsp->ls_id,
+ },
+ .reclaim = reclaim,
+ };
struct nfs_lockargs arg = {
.fh = NFS_FH(inode),
.type = nfs4_lck_type(cmd, request),
.offset = request->fl_start,
.length = nfs4_lck_length(request),
+ .u = {
+ .lock = &largs,
+ },
};
struct nfs_lockres res = {
.server = server,
@@ -2779,53 +2996,39 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
.rpc_resp = &res,
.rpc_cred = state->owner->so_cred,
};
- struct nfs_lock_opargs largs = {
- .reclaim = reclaim,
- .new_lock_owner = 0,
- };
- int status;
+ int status = -ENOMEM;
- if (!(lsp->ls_flags & NFS_LOCK_INITIALIZED)) {
+ largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+ if (largs.lock_seqid == NULL)
+ return -ENOMEM;
+ if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
struct nfs4_state_owner *owner = state->owner;
- struct nfs_open_to_lock otl = {
- .lock_owner = {
- .clientid = server->nfs4_state->cl_clientid,
- },
- };
-
- otl.lock_seqid = lsp->ls_seqid;
- otl.lock_owner.id = lsp->ls_id;
- memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid));
- largs.u.open_lock = &otl;
+
+ largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid);
+ if (largs.open_seqid == NULL)
+ goto out;
largs.new_lock_owner = 1;
- arg.u.lock = &largs;
- down(&owner->so_sema);
- otl.open_seqid = owner->so_seqid;
status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- /* increment open_owner seqid on success, and
- * seqid mutating errors */
- nfs4_increment_seqid(status, owner);
- up(&owner->so_sema);
- if (status == 0) {
- lsp->ls_flags |= NFS_LOCK_INITIALIZED;
- lsp->ls_seqid++;
+ /* increment the open seqid on success, and on seqid-mutating errors */
+ if (largs.new_lock_owner != 0) {
+ nfs_increment_open_seqid(status, largs.open_seqid);
+ if (status == 0)
+ nfs_confirm_seqid(&lsp->ls_seqid, 0);
}
- } else {
- struct nfs_exist_lock el = {
- .seqid = lsp->ls_seqid,
- };
- memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid));
- largs.u.exist_lock = &el;
- arg.u.lock = &largs;
+ nfs_free_seqid(largs.open_seqid);
+ } else
status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
- /* increment seqid on success, and * seqid mutating errors*/
- nfs4_increment_lock_seqid(status, lsp);
- }
+ /* increment the lock seqid on success, and on seqid-mutating errors */
+ nfs_increment_lock_seqid(status, largs.lock_seqid);
/* save the returned stateid. */
- if (status == 0)
- memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid));
- else if (status == -NFS4ERR_DENIED)
+ if (status == 0) {
+ memcpy(lsp->ls_stateid.data, res.u.stateid.data,
+ sizeof(lsp->ls_stateid.data));
+ lsp->ls_flags |= NFS_LOCK_INITIALIZED;
+ } else if (status == -NFS4ERR_DENIED)
status = -EAGAIN;
+out:
+ nfs_free_seqid(largs.lock_seqid);
return status;
}
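
The unlock calldata above is shared between the RPC completion path and the synchronous caller, so it starts with a reference count of two plus a completion; whichever side finishes last frees it. The pattern in miniature (single-threaded userspace model):

    #include <stdio.h>
    #include <stdlib.h>

    struct calldata { int refcount; };

    static void release(struct calldata *c)
    {
            if (--c->refcount == 0) {  /* last owner frees */
                    printf("freeing calldata\n");
                    free(c);
            }
    }

    int main(void)
    {
            struct calldata *c = malloc(sizeof(*c));

            if (c == NULL)
                    return 1;
            c->refcount = 2;  /* one for the RPC, one for the caller */

            release(c);       /* RPC completion path drops its reference */
            release(c);       /* caller, after waiting, drops the last one */
            return 0;
    }
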
@@ -2865,11 +3068,9 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
int status;
down_read(&clp->cl_sem);
- down(&state->lock_sema);
status = nfs4_set_lock_state(state, request);
if (status == 0)
status = _nfs4_do_setlk(state, cmd, request, 0);
- up(&state->lock_sema);
if (status == 0) {
/* Note: we always want to sleep here! */
request->fl_flags |= FL_SLEEP;
@@ -3024,8 +3225,8 @@ struct nfs_rpc_ops nfs_v4_clientops = {
.read_setup = nfs4_proc_read_setup,
.write_setup = nfs4_proc_write_setup,
.commit_setup = nfs4_proc_commit_setup,
- .file_open = nfs4_proc_file_open,
- .file_release = nfs4_proc_file_release,
+ .file_open = nfs_open,
+ .file_release = nfs_release,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index afe587d82f1..2d5a6a2b9de 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -264,13 +264,16 @@ nfs4_alloc_state_owner(void)
{
struct nfs4_state_owner *sp;
- sp = kmalloc(sizeof(*sp),GFP_KERNEL);
+ sp = kzalloc(sizeof(*sp), GFP_KERNEL);
if (!sp)
return NULL;
- init_MUTEX(&sp->so_sema);
- sp->so_seqid = 0; /* arbitrary */
+ spin_lock_init(&sp->so_lock);
INIT_LIST_HEAD(&sp->so_states);
INIT_LIST_HEAD(&sp->so_delegations);
+ rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
+ sp->so_seqid.sequence = &sp->so_sequence;
+ spin_lock_init(&sp->so_sequence.lock);
+ INIT_LIST_HEAD(&sp->so_sequence.list);
atomic_set(&sp->so_count, 1);
return sp;
}
@@ -359,7 +362,6 @@ nfs4_alloc_open_state(void)
memset(state->stateid.data, 0, sizeof(state->stateid.data));
atomic_set(&state->count, 1);
INIT_LIST_HEAD(&state->lock_states);
- init_MUTEX(&state->lock_sema);
spin_lock_init(&state->state_lock);
return state;
}
@@ -437,21 +439,23 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
if (state)
goto out;
new = nfs4_alloc_open_state();
+ spin_lock(&owner->so_lock);
spin_lock(&inode->i_lock);
state = __nfs4_find_state_byowner(inode, owner);
if (state == NULL && new != NULL) {
state = new;
- /* Caller *must* be holding owner->so_sem */
- /* Note: The reclaim code dictates that we add stateless
- * and read-only stateids to the end of the list */
- list_add_tail(&state->open_states, &owner->so_states);
state->owner = owner;
atomic_inc(&owner->so_count);
list_add(&state->inode_states, &nfsi->open_states);
state->inode = igrab(inode);
spin_unlock(&inode->i_lock);
+ /* Note: The reclaim code dictates that we add stateless
+ * and read-only stateids to the end of the list */
+ list_add_tail(&state->open_states, &owner->so_states);
+ spin_unlock(&owner->so_lock);
} else {
spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
if (new)
nfs4_free_open_state(new);
}
@@ -461,19 +465,21 @@ out:
/*
* Beware! Caller must be holding exactly one
- * reference to clp->cl_sem and owner->so_sema!
+ * reference to clp->cl_sem!
*/
void nfs4_put_open_state(struct nfs4_state *state)
{
struct inode *inode = state->inode;
struct nfs4_state_owner *owner = state->owner;
- if (!atomic_dec_and_lock(&state->count, &inode->i_lock))
+ if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
return;
+ spin_lock(&inode->i_lock);
if (!list_empty(&state->inode_states))
list_del(&state->inode_states);
- spin_unlock(&inode->i_lock);
list_del(&state->open_states);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
iput(inode);
BUG_ON (state->state != 0);
nfs4_free_open_state(state);
@@ -481,20 +487,17 @@ void nfs4_put_open_state(struct nfs4_state *state)
}
/*
- * Beware! Caller must be holding no references to clp->cl_sem!
- * of owner->so_sema!
+ * Close the current file.
*/
void nfs4_close_state(struct nfs4_state *state, mode_t mode)
{
struct inode *inode = state->inode;
struct nfs4_state_owner *owner = state->owner;
- struct nfs4_client *clp = owner->so_client;
int newstate;
atomic_inc(&owner->so_count);
- down_read(&clp->cl_sem);
- down(&owner->so_sema);
/* Protect against nfs4_find_state() */
+ spin_lock(&owner->so_lock);
spin_lock(&inode->i_lock);
if (mode & FMODE_READ)
state->nreaders--;
@@ -507,6 +510,7 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
list_move_tail(&state->open_states, &owner->so_states);
}
spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
newstate = 0;
if (state->state != 0) {
if (state->nreaders)
@@ -515,14 +519,16 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
newstate |= FMODE_WRITE;
if (state->state == newstate)
goto out;
- if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS)
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+ state->state = newstate;
+ goto out;
+ }
+ if (nfs4_do_close(inode, state, newstate) == 0)
return;
}
out:
nfs4_put_open_state(state);
- up(&owner->so_sema);
nfs4_put_state_owner(owner);
- up_read(&clp->cl_sem);
}
/*
@@ -546,19 +552,16 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
*
- * The caller must be holding state->lock_sema
*/
static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *lsp;
struct nfs4_client *clp = state->owner->so_client;
- lsp = kmalloc(sizeof(*lsp), GFP_KERNEL);
+ lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
if (lsp == NULL)
return NULL;
- lsp->ls_flags = 0;
- lsp->ls_seqid = 0; /* arbitrary */
- memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
+ lsp->ls_seqid.sequence = &state->owner->so_sequence;
atomic_set(&lsp->ls_count, 1);
lsp->ls_owner = fl_owner;
spin_lock(&clp->cl_lock);
@@ -572,7 +575,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
*
- * The caller must be holding state->lock_sema and clp->cl_sem
+ * The caller must be holding clp->cl_sem
*/
static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
@@ -605,7 +608,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
* Release reference to lock_state, and free it if we see that
* it is no longer in use
*/
-static void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
+void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
{
struct nfs4_state *state;
@@ -673,29 +676,94 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
nfs4_put_lock_state(lsp);
}
-/*
-* Called with state->lock_sema and clp->cl_sem held.
-*/
-void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
{
- if (status == NFS_OK || seqid_mutating_err(-status))
- lsp->ls_seqid++;
+ struct nfs_seqid *new;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (new != NULL) {
+ new->sequence = counter;
+ INIT_LIST_HEAD(&new->list);
+ }
+ return new;
+}
+
+void nfs_free_seqid(struct nfs_seqid *seqid)
+{
+ struct rpc_sequence *sequence = seqid->sequence->sequence;
+
+ if (!list_empty(&seqid->list)) {
+ spin_lock(&sequence->lock);
+ list_del(&seqid->list);
+ spin_unlock(&sequence->lock);
+ }
+ rpc_wake_up_next(&sequence->wait);
+ kfree(seqid);
}
/*
-* Called with sp->so_sema and clp->cl_sem held.
-*
-* Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
-* failed with a seqid incrementing error -
-* see comments nfs_fs.h:seqid_mutating_error()
-*/
-void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp)
-{
- if (status == NFS_OK || seqid_mutating_err(-status))
- sp->so_seqid++;
- /* If the server returns BAD_SEQID, unhash state_owner here */
- if (status == -NFS4ERR_BAD_SEQID)
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid-incrementing error; see the comment at
+ * nfs_fs.h:seqid_mutating_error().
+ */
+static inline void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
+{
+ switch (status) {
+ case 0:
+ break;
+ case -NFS4ERR_BAD_SEQID:
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_BADXDR:
+ case -NFS4ERR_RESOURCE:
+ case -NFS4ERR_NOFILEHANDLE:
+ /* Non-seqid mutating errors */
+ return;
+ };
+ /*
+ * Note: no locking needed as we are guaranteed to be first
+ * on the sequence list
+ */
+ seqid->sequence->counter++;
+}
+
+void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
+{
+ if (status == -NFS4ERR_BAD_SEQID) {
+ struct nfs4_state_owner *sp = container_of(seqid->sequence,
+ struct nfs4_state_owner, so_seqid);
nfs4_drop_state_owner(sp);
+ }
+ return nfs_increment_seqid(status, seqid);
+}
+
+/*
+ * Increment the seqid if the LOCK/LOCKU succeeded, or
+ * failed with a seqid-incrementing error; see the comment at
+ * nfs_fs.h:seqid_mutating_error().
+ */
+void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
+{
+ return nfs_increment_seqid(status, seqid);
+}
+
+int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
+{
+ struct rpc_sequence *sequence = seqid->sequence->sequence;
+ int status = 0;
+
+ if (sequence->list.next == &seqid->list)
+ goto out;
+ spin_lock(&sequence->lock);
+ if (!list_empty(&sequence->list)) {
+ rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+ status = -EAGAIN;
+ } else
+ list_add(&seqid->list, &sequence->list);
+ spin_unlock(&sequence->lock);
+out:
+ return status;
}
static int reclaimer(void *);
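
nfs_wait_on_sequence() and nfs_free_seqid() together impose a strict one-at-a-time order on seqid-mutating calls per owner: a caller proceeds only if the list is empty (claiming the head) or its seqid is already first; otherwise it sleeps until the current head is freed. A userspace model of that queue discipline (an array in place of the kernel list, -1 in place of -EAGAIN):

    #include <stdio.h>
    #include <string.h>

    #define MAXQ 4

    static const char *queue[MAXQ];
    static int count;

    /* 0: transmit now; -1: sleep and retry once the head is freed */
    static int wait_on_sequence(const char *op)
    {
            if (count > 0 && strcmp(queue[0], op) == 0)
                    return 0;    /* our seqid already heads the list */
            if (count != 0)
                    return -1;   /* someone else is mid-call */
            queue[count++] = op; /* list was empty: claim the head */
            return 0;
    }

    static void free_seqid(void)
    {
            if (count > 0)
                    memmove(&queue[0], &queue[1], --count * sizeof(queue[0]));
            /* the real nfs_free_seqid() wakes the next sleeper here */
    }

    int main(void)
    {
            printf("OPEN  -> %d\n", wait_on_sequence("OPEN"));   /* 0 */
            printf("CLOSE -> %d\n", wait_on_sequence("CLOSE"));  /* -1 */
            free_seqid();                                        /* OPEN done */
            printf("CLOSE -> %d\n", wait_on_sequence("CLOSE"));  /* 0 */
            return 0;
    }
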
@@ -791,8 +859,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
if (state->state == 0)
continue;
status = ops->recover_open(sp, state);
- list_for_each_entry(lock, &state->lock_states, ls_locks)
- lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
if (status >= 0) {
status = nfs4_reclaim_locks(ops, state);
if (status < 0)
@@ -831,6 +897,28 @@ out_err:
return status;
}
+static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
+{
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state;
+ struct nfs4_lock_state *lock;
+
+ /* Reset all sequence ids to zero */
+ list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+ sp->so_seqid.counter = 0;
+ sp->so_seqid.flags = 0;
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ lock->ls_seqid.counter = 0;
+ lock->ls_seqid.flags = 0;
+ lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+ }
+ }
+ spin_unlock(&sp->so_lock);
+ }
+}
+
static int reclaimer(void *ptr)
{
struct reclaimer_args *args = (struct reclaimer_args *)ptr;
@@ -864,6 +952,7 @@ restart_loop:
default:
ops = &nfs4_network_partition_recovery_ops;
};
+ nfs4_state_mark_reclaim(clp);
status = __nfs4_init_client(clp);
if (status)
goto out_error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6c564ef9489..fbbace8a30c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -95,6 +95,8 @@ static int nfs_stat_to_errno(int);
#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
#define encode_savefh_maxsz (op_encode_hdr_maxsz)
#define decode_savefh_maxsz (op_decode_hdr_maxsz)
+#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
+#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2)
#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
@@ -157,16 +159,20 @@ static int nfs_stat_to_errno(int);
op_decode_hdr_maxsz + 2)
#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- op_encode_hdr_maxsz + 8)
+ op_encode_hdr_maxsz + 8 + \
+ encode_getattr_maxsz)
#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- op_decode_hdr_maxsz + 4)
+ op_decode_hdr_maxsz + 4 + \
+ decode_getattr_maxsz)
#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- op_encode_hdr_maxsz + 3)
+ op_encode_hdr_maxsz + 3 + \
+ encode_getattr_maxsz)
#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- op_decode_hdr_maxsz + 2)
+ op_decode_hdr_maxsz + 2 + \
+ decode_getattr_maxsz)
#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
op_encode_hdr_maxsz + \
@@ -196,17 +202,21 @@ static int nfs_stat_to_errno(int);
#define NFS4_enc_open_downgrade_sz \
(compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- op_encode_hdr_maxsz + 7)
+ op_encode_hdr_maxsz + 7 + \
+ encode_getattr_maxsz)
#define NFS4_dec_open_downgrade_sz \
(compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- op_decode_hdr_maxsz + 4)
+ op_decode_hdr_maxsz + 4 + \
+ decode_getattr_maxsz)
#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- op_encode_hdr_maxsz + 5)
+ op_encode_hdr_maxsz + 5 + \
+ encode_getattr_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- op_decode_hdr_maxsz + 4)
+ op_decode_hdr_maxsz + 4 + \
+ decode_getattr_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
op_encode_hdr_maxsz + 4 + \
@@ -300,30 +310,44 @@ static int nfs_stat_to_errno(int);
decode_getfh_maxsz)
#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- encode_remove_maxsz)
+ encode_remove_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- op_decode_hdr_maxsz + 5)
+ op_decode_hdr_maxsz + 5 + \
+ decode_getattr_maxsz)
#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
- encode_rename_maxsz)
+ encode_rename_maxsz + \
+ encode_getattr_maxsz + \
+ encode_restorefh_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
- decode_rename_maxsz)
+ decode_rename_maxsz + \
+ decode_getattr_maxsz + \
+ decode_restorefh_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
- encode_link_maxsz)
+ encode_link_maxsz + \
+ encode_getattr_maxsz + \
+ encode_restorefh_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
- decode_link_maxsz)
+ decode_link_maxsz + \
+ decode_getattr_maxsz + \
+ decode_restorefh_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_symlink_maxsz + \
@@ -336,14 +360,20 @@ static int nfs_stat_to_errno(int);
decode_getfh_maxsz)
#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
encode_create_maxsz + \
+ encode_getfh_maxsz + \
encode_getattr_maxsz + \
- encode_getfh_maxsz)
+ encode_restorefh_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
decode_create_maxsz + \
+ decode_getfh_maxsz + \
decode_getattr_maxsz + \
- decode_getfh_maxsz)
+ decode_restorefh_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_getattr_maxsz)
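
Each *_maxsz constant above is a worst-case estimate in 32-bit XDR words, so
every GETATTR, SAVEFH or RESTOREFH spliced into a compound must also be added
to that compound's size macro, or the RPC layer could size a send or receive
buffer too small. A back-of-envelope check with purely hypothetical values
(none of these numbers are taken from this patch):

	/*
	 * If compound_encode_hdr_maxsz were 38, encode_putfh_maxsz 42 and
	 * encode_remove_maxsz 134, the old REMOVE estimate would be
	 * 38 + 42 + 134 = 214 XDR words; adding encode_getattr_maxsz = 20
	 * lifts it to 234 words, i.e. the send buffer grows by 20 * 4 = 80
	 * bytes to make room for the post-op GETATTR.
	 */
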
@@ -602,10 +632,10 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
{
uint32_t *p;
- RESERVE_SPACE(8+sizeof(arg->stateid.data));
+ RESERVE_SPACE(8+sizeof(arg->stateid->data));
WRITE32(OP_CLOSE);
- WRITE32(arg->seqid);
- WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+ WRITE32(arg->seqid->sequence->counter);
+ WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
return 0;
}
@@ -729,22 +759,18 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
WRITE64(arg->length);
WRITE32(opargs->new_lock_owner);
if (opargs->new_lock_owner){
- struct nfs_open_to_lock *ol = opargs->u.open_lock;
-
RESERVE_SPACE(40);
- WRITE32(ol->open_seqid);
- WRITEMEM(&ol->open_stateid, sizeof(ol->open_stateid));
- WRITE32(ol->lock_seqid);
- WRITE64(ol->lock_owner.clientid);
+ WRITE32(opargs->open_seqid->sequence->counter);
+ WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data));
+ WRITE32(opargs->lock_seqid->sequence->counter);
+ WRITE64(opargs->lock_owner.clientid);
WRITE32(4);
- WRITE32(ol->lock_owner.id);
+ WRITE32(opargs->lock_owner.id);
}
else {
- struct nfs_exist_lock *el = opargs->u.exist_lock;
-
RESERVE_SPACE(20);
- WRITEMEM(&el->stateid, sizeof(el->stateid));
- WRITE32(el->seqid);
+ WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data));
+ WRITE32(opargs->lock_seqid->sequence->counter);
}
return 0;
@@ -775,8 +801,8 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
RESERVE_SPACE(44);
WRITE32(OP_LOCKU);
WRITE32(arg->type);
- WRITE32(opargs->seqid);
- WRITEMEM(&opargs->stateid, sizeof(opargs->stateid));
+ WRITE32(opargs->seqid->sequence->counter);
+ WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data));
WRITE64(arg->offset);
WRITE64(arg->length);
@@ -826,7 +852,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
*/
RESERVE_SPACE(8);
WRITE32(OP_OPEN);
- WRITE32(arg->seqid);
+ WRITE32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->open_flags);
RESERVE_SPACE(16);
WRITE64(arg->clientid);
@@ -941,7 +967,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
RESERVE_SPACE(8+sizeof(arg->stateid.data));
WRITE32(OP_OPEN_CONFIRM);
WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
- WRITE32(arg->seqid);
+ WRITE32(arg->seqid->sequence->counter);
return 0;
}
@@ -950,10 +976,10 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
{
uint32_t *p;
- RESERVE_SPACE(8+sizeof(arg->stateid.data));
+ RESERVE_SPACE(8+sizeof(arg->stateid->data));
WRITE32(OP_OPEN_DOWNGRADE);
- WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
- WRITE32(arg->seqid);
+ WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
+ WRITE32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->open_flags);
return 0;
}
@@ -1117,6 +1143,17 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client
}
static int
+encode_restorefh(struct xdr_stream *xdr)
+{
+ uint32_t *p;
+
+ RESERVE_SPACE(4);
+ WRITE32(OP_RESTOREFH);
+
+ return 0;
+}
+
+static int
encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
{
uint32_t *p;
@@ -1296,14 +1333,18 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 3,
};
int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->fh)) == 0)
- status = encode_remove(&xdr, args->name);
+ if ((status = encode_putfh(&xdr, args->fh)) != 0)
+ goto out;
+ if ((status = encode_remove(&xdr, args->name)) != 0)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
+out:
return status;
}
@@ -1314,7 +1355,7 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 4,
+ .nops = 7,
};
int status;
@@ -1326,7 +1367,13 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n
goto out;
if ((status = encode_putfh(&xdr, args->new_dir)) != 0)
goto out;
- status = encode_rename(&xdr, args->old_name, args->new_name);
+ if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0)
+ goto out;
+ if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+ goto out;
+ if ((status = encode_restorefh(&xdr)) != 0)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
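
The rename encoder above now emits a seven-operation compound, which is what
the hdr.nops bump from 4 to 7 accounts for. As a sketch (operation order
inferred from the encode calls; the first two ops sit just above this hunk):

	/*
	 *	1. PUTFH(old_dir)    2. SAVEFH            3. PUTFH(new_dir)
	 *	4. RENAME            5. GETATTR(new_dir)  6. RESTOREFH
	 *	7. GETATTR(old_dir)
	 *
	 * After RENAME the current filehandle is the target directory, so
	 * the first GETATTR fetches its post-op attributes; RESTOREFH then
	 * brings back the saved source directory for the second GETATTR.
	 */
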
@@ -1338,7 +1385,7 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 4,
+ .nops = 7,
};
int status;
@@ -1350,7 +1397,13 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs
goto out;
if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
goto out;
- status = encode_link(&xdr, args->name);
+ if ((status = encode_link(&xdr, args->name)) != 0)
+ goto out;
+ if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+ goto out;
+ if ((status = encode_restorefh(&xdr)) != 0)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -1362,7 +1415,7 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 4,
+ .nops = 7,
};
int status;
@@ -1370,10 +1423,16 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n
encode_compound_hdr(&xdr, &hdr);
if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
goto out;
+ if ((status = encode_savefh(&xdr)) != 0)
+ goto out;
if ((status = encode_create(&xdr, args)) != 0)
goto out;
if ((status = encode_getfh(&xdr)) != 0)
goto out;
+ if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+ goto out;
+ if ((status = encode_restorefh(&xdr)) != 0)
+ goto out;
status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
@@ -1412,7 +1471,7 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 3,
};
int status;
@@ -1422,6 +1481,9 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos
if(status)
goto out;
status = encode_close(&xdr, args);
+ if (status != 0)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -1433,15 +1495,21 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 4,
+ .nops = 7,
};
int status;
+ status = nfs_wait_on_sequence(args->seqid, req->rq_task);
+ if (status != 0)
+ goto out;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
status = encode_putfh(&xdr, args->fh);
if (status)
goto out;
+ status = encode_savefh(&xdr);
+ if (status)
+ goto out;
status = encode_open(&xdr, args);
if (status)
goto out;
@@ -1449,6 +1517,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena
if (status)
goto out;
status = encode_getfattr(&xdr, args->bitmask);
+ if (status)
+ goto out;
+ status = encode_restorefh(&xdr);
+ if (status)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -1464,6 +1538,9 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n
};
int status;
+ status = nfs_wait_on_sequence(args->seqid, req->rq_task);
+ if (status != 0)
+ goto out;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
status = encode_putfh(&xdr, args->fh);
@@ -1485,6 +1562,9 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf
};
int status;
+ status = nfs_wait_on_sequence(args->seqid, req->rq_task);
+ if (status != 0)
+ goto out;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
status = encode_putfh(&xdr, args->fh);
@@ -1502,7 +1582,7 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 3,
};
int status;
@@ -1512,6 +1592,9 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct
if (status)
goto out;
status = encode_open_downgrade(&xdr, args);
+ if (status != 0)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -1525,8 +1608,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_locka
struct compound_hdr hdr = {
.nops = 2,
};
+ struct nfs_lock_opargs *opargs = args->u.lock;
int status;
+ status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task);
+ if (status != 0)
+ goto out;
+ /* Do we need to do an open_to_lock_owner? */
+ if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)
+ opargs->new_lock_owner = 0;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
status = encode_putfh(&xdr, args->fh);
@@ -1713,7 +1803,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 3,
};
int status;
@@ -1723,6 +1813,9 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ
if (status)
goto out;
status = encode_write(&xdr, args);
+ if (status)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -1734,7 +1827,7 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 3,
};
int status;
@@ -1744,6 +1837,9 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri
if (status)
goto out;
status = encode_commit(&xdr, args);
+ if (status)
+ goto out;
+ status = encode_getfattr(&xdr, args->bitmask);
out:
return status;
}
@@ -2670,8 +2766,7 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- if (status != 0)
- printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
return status;
}
@@ -2704,8 +2799,7 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- if (status != 0)
- printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
return status;
}
@@ -2730,8 +2824,7 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- if (status != 0)
- printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
return status;
}
@@ -2787,13 +2880,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
goto xdr_error;
if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
goto xdr_error;
- if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) {
+ if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
- fattr->timestamp = jiffies;
- }
xdr_error:
- if (status != 0)
- printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d\n", __FUNCTION__, -status);
return status;
}
@@ -2826,8 +2916,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- if (status != 0)
- printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
return status;
}
@@ -2890,8 +2979,8 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res)
status = decode_op_hdr(xdr, OP_LOCK);
if (status == 0) {
- READ_BUF(sizeof(nfs4_stateid));
- COPYMEM(&res->u.stateid, sizeof(res->u.stateid));
+ READ_BUF(sizeof(res->u.stateid.data));
+ COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
} else if (status == -NFS4ERR_DENIED)
return decode_lock_denied(xdr, &res->u.denied);
return status;
@@ -2913,8 +3002,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res)
status = decode_op_hdr(xdr, OP_LOCKU);
if (status == 0) {
- READ_BUF(sizeof(nfs4_stateid));
- COPYMEM(&res->u.stateid, sizeof(res->u.stateid));
+ READ_BUF(sizeof(res->u.stateid.data));
+ COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
}
return status;
}
@@ -2994,7 +3083,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
p += bmlen;
return decode_delegation(xdr, res);
xdr_error:
- printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__);
+ dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen);
return -EIO;
}
@@ -3208,6 +3297,12 @@ static int decode_renew(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_RENEW);
}
+static int
+decode_restorefh(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_RESTOREFH);
+}
+
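
RESTOREFH carries no arguments and returns nothing beyond its status, which
is why encode_restorefh() earlier reserves a single word and the decoder above
is a bare op-header check. On the wire, schematically (the opcode value 31 is
quoted from RFC 3530, not from this diff):

	/*
	 *	encode:  | OP_RESTOREFH (31) |
	 *	decode:  | OP_RESTOREFH (31) | nfsstat4 |
	 */
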
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
size_t *acl_len)
{
@@ -3243,7 +3338,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
if (attrlen <= *acl_len)
xdr_read_pages(xdr, attrlen);
*acl_len = attrlen;
- }
+ } else
+ status = -EOPNOTSUPP;
out:
return status;
@@ -3352,6 +3448,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, stru
if (status)
goto out;
status = decode_open_downgrade(&xdr, res);
+ if (status != 0)
+ goto out;
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
@@ -3424,7 +3523,7 @@ out:
/*
* Decode REMOVE response
*/
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3433,8 +3532,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
- if ((status = decode_putfh(&xdr)) == 0)
- status = decode_remove(&xdr, cinfo);
+ if ((status = decode_putfh(&xdr)) != 0)
+ goto out;
+ if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
+ goto out;
+ decode_getfattr(&xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -3457,7 +3559,14 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
goto out;
if ((status = decode_putfh(&xdr)) != 0)
goto out;
- status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo);
+ if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
+ goto out;
+ /* Current FH is target directory */
+ if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+ goto out;
+ if ((status = decode_restorefh(&xdr)) != 0)
+ goto out;
+ decode_getfattr(&xdr, res->old_fattr, res->server);
out:
return status;
}
@@ -3465,7 +3574,7 @@ out:
/*
* Decode LINK response
*/
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3480,7 +3589,17 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ch
goto out;
if ((status = decode_putfh(&xdr)) != 0)
goto out;
- status = decode_link(&xdr, cinfo);
+ if ((status = decode_link(&xdr, &res->cinfo)) != 0)
+ goto out;
+ /*
+ * Note order: OP_LINK leaves the directory as the current
+ * filehandle.
+ */
+ if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+ goto out;
+ if ((status = decode_restorefh(&xdr)) != 0)
+ goto out;
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
@@ -3499,13 +3618,17 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
goto out;
if ((status = decode_putfh(&xdr)) != 0)
goto out;
+ if ((status = decode_savefh(&xdr)) != 0)
+ goto out;
if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
goto out;
if ((status = decode_getfh(&xdr, res->fh)) != 0)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
- if (status == NFS4ERR_DELAY)
- status = 0;
+ if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+ goto out;
+ if ((status = decode_restorefh(&xdr)) != 0)
+ goto out;
+ decode_getfattr(&xdr, res->dir_fattr, res->server);
out:
return status;
}
@@ -3623,6 +3746,15 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_cl
if (status)
goto out;
status = decode_close(&xdr, res);
+ if (status != 0)
+ goto out;
+ /*
+ * Note: Server may do delete on close for this file,
+ * in which case the getattr call will fail with
+ * an ESTALE error. Shouldn't be a problem,
+ * though, since fattr->valid will remain unset.
+ */
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
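
The comment above illustrates a convention used by every decoder this patch
touches: the trailing decode_getfattr() result is deliberately discarded, so
a failed post-op GETATTR can never turn a successful CLOSE (or WRITE, REMOVE,
...) into an error. A hypothetical caller-side consequence (names reused from
elsewhere in this patch; the snippet itself is illustrative):

	if (!(res->fattr->valid & NFS_ATTR_FATTR))
		nfs_mark_for_revalidate(inode);	/* attrs never arrived */
	else
		nfs_post_op_update_inode(inode, res->fattr);
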
@@ -3643,15 +3775,20 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_ope
status = decode_putfh(&xdr);
if (status)
goto out;
+ status = decode_savefh(&xdr);
+ if (status)
+ goto out;
status = decode_open(&xdr, res);
if (status)
goto out;
status = decode_getfh(&xdr, &res->fh);
if (status)
goto out;
- status = decode_getfattr(&xdr, res->f_attr, res->server);
- if (status == NFS4ERR_DELAY)
- status = 0;
+ if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+ goto out;
+ if ((status = decode_restorefh(&xdr)) != 0)
+ goto out;
+ decode_getfattr(&xdr, res->dir_attr, res->server);
out:
return status;
}
@@ -3869,6 +4006,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_wr
if (status)
goto out;
status = decode_write(&xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(&xdr, res->fattr, res->server);
if (!status)
status = res->count;
out:
@@ -3892,6 +4032,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_w
if (status)
goto out;
status = decode_commit(&xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index be23c3fb926..a48a003242c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -61,7 +61,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("%s: call getattr\n", __FUNCTION__);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0);
dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
if (status)
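
Throughout fs/nfs/proc.c the open-coded "fattr->valid = 0" is replaced by
nfs_fattr_init(). That helper is introduced elsewhere in this series and is
not shown in this diff; a minimal sketch of what it is assumed to do, based
on the "fattr->timestamp = jiffies" assignment that decode_getfattr() stops
making above:

	static void nfs_fattr_init_sketch(struct nfs_fattr *fattr)
	{
		fattr->valid = 0;		/* nothing received yet */
		fattr->timestamp = jiffies;	/* assumed: lets the client
						 * order attribute data from
						 * competing RPC replies */
	}
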
@@ -93,7 +93,7 @@ nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call getattr\n");
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(server->client, NFSPROC_GETATTR,
fhandle, fattr, 0);
dprintk("NFS reply getattr: %d\n", status);
@@ -112,7 +112,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;
dprintk("NFS call setattr\n");
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
@@ -136,7 +136,7 @@ nfs_proc_lookup(struct inode *dir, struct qstr *name,
int status;
dprintk("NFS call lookup %s\n", name->name);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0);
dprintk("NFS reply lookup: %d\n", status);
return status;
@@ -174,7 +174,7 @@ static int nfs_proc_read(struct nfs_read_data *rdata)
dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
(long long) rdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
if (status >= 0) {
nfs_refresh_inode(inode, fattr);
@@ -203,10 +203,10 @@ static int nfs_proc_write(struct nfs_write_data *wdata)
dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
(long long) wdata->args.offset);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
if (status >= 0) {
- nfs_refresh_inode(inode, fattr);
+ nfs_post_op_update_inode(inode, fattr);
wdata->res.count = wdata->args.count;
wdata->verf.committed = NFS_FILE_SYNC;
}
@@ -216,7 +216,7 @@ static int nfs_proc_write(struct nfs_write_data *wdata)
static int
nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
- int flags)
+ int flags, struct nameidata *nd)
{
struct nfs_fh fhandle;
struct nfs_fattr fattr;
@@ -232,7 +232,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
};
int status;
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
dprintk("NFS call create %s\n", dentry->d_name.name);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
if (status == 0)
@@ -273,12 +273,13 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
}
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
+ nfs_mark_for_revalidate(dir);
if (status == -EINVAL && S_ISFIFO(mode)) {
sattr->ia_mode = mode;
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
}
if (status == 0)
@@ -305,6 +306,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
dprintk("NFS call remove %s\n", name->name);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
dprintk("NFS reply remove: %d\n", status);
return status;
@@ -331,8 +333,10 @@ nfs_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
{
struct rpc_message *msg = &task->tk_msg;
- if (msg->rpc_argp)
+ if (msg->rpc_argp) {
+ nfs_mark_for_revalidate(dir->d_inode);
kfree(msg->rpc_argp);
+ }
return 0;
}
@@ -352,6 +356,8 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0);
+ nfs_mark_for_revalidate(old_dir);
+ nfs_mark_for_revalidate(new_dir);
dprintk("NFS reply rename: %d\n", status);
return status;
}
@@ -369,6 +375,7 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
dprintk("NFS call link %s\n", name->name);
status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0);
+ nfs_mark_for_revalidate(dir);
dprintk("NFS reply link: %d\n", status);
return status;
}
@@ -391,9 +398,10 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
if (path->len > NFS2_MAXPATHLEN)
return -ENAMETOOLONG;
dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
- fattr->valid = 0;
+ nfs_fattr_init(fattr);
fhandle->size = 0;
status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0);
+ nfs_mark_for_revalidate(dir);
dprintk("NFS reply symlink: %d\n", status);
return status;
}
@@ -416,8 +424,9 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
int status;
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
- fattr.valid = 0;
+ nfs_fattr_init(&fattr);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0);
+ nfs_mark_for_revalidate(dir);
if (status == 0)
status = nfs_instantiate(dentry, &fhandle, &fattr);
dprintk("NFS reply mkdir: %d\n", status);
@@ -436,6 +445,7 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
dprintk("NFS call rmdir %s\n", name->name);
status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0);
+ nfs_mark_for_revalidate(dir);
dprintk("NFS reply rmdir: %d\n", status);
return status;
}
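
NFSv2 procedures carry no post-op attributes for the parent directory, so
each directory-mutating call in this file now flags the directory with
nfs_mark_for_revalidate() right after the RPC returns. A sketch of the
assumed helper (it is added elsewhere in this series; the exact locking is a
guess and omitted here):

	static inline void nfs_mark_for_revalidate_sketch(struct inode *dir)
	{
		/* Invalidate cached attributes and cached readdir data so
		 * the next access goes back to the server. */
		NFS_I(dir)->cache_validity |= NFS_INO_INVALID_ATTR |
					      NFS_INO_INVALID_DATA;
	}
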
@@ -484,7 +494,7 @@ nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call statfs\n");
- stat->fattr->valid = 0;
+ nfs_fattr_init(stat->fattr);
status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
dprintk("NFS reply statfs: %d\n", status);
if (status)
@@ -507,7 +517,7 @@ nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
int status;
dprintk("NFS call fsinfo\n");
- info->fattr->valid = 0;
+ nfs_fattr_init(info->fattr);
status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
dprintk("NFS reply fsinfo: %d\n", status);
if (status)
@@ -579,7 +589,7 @@ nfs_write_done(struct rpc_task *task)
struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
if (task->tk_status >= 0)
- nfs_refresh_inode(data->inode, data->res.fattr);
+ nfs_post_op_update_inode(data->inode, data->res.fattr);
nfs_writeback_done(task);
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 9758ebd4990..43b03b19731 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -215,6 +215,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
data->res.fattr = &data->fattr;
data->res.count = count;
data->res.eof = 0;
+ nfs_fattr_init(&data->fattr);
NFS_PROTO(inode)->read_setup(data);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5130eda231d..819a65f5071 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -870,6 +870,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
data->res.fattr = &data->fattr;
data->res.count = count;
data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
NFS_PROTO(inode)->write_setup(data, how);
@@ -1237,6 +1238,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
data->res.count = 0;
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
NFS_PROTO(inode)->commit_setup(data, how);
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index de58579a1d0..50a7749cfca 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,18 +1,15 @@
ToDo/Notes:
- Find and fix bugs.
- - In between ntfs_prepare/commit_write, need exclusion between
- simultaneous file extensions. This is given to us by holding i_sem
- on the inode. The only places in the kernel when a file is resized
- are prepare/commit write and truncate for both of which i_sem is
- held. Just have to be careful in readpage/writepage and all other
- helpers not running under i_sem that we play nice...
- Also need to be careful with initialized_size extention in
- ntfs_prepare_write. Basically, just be _very_ careful in this code...
- UPDATE: The only things that need to be checked are read/writepage
- which do not hold i_sem. Note writepage cannot change i_size but it
- needs to cope with a concurrent i_size change, just like readpage.
- Also both need to cope with concurrent changes to the other sizes,
- i.e. initialized/allocated/compressed size, as well.
+ - The only places in the kernel where a file is resized are
+ ntfs_file_write*() and ntfs_truncate() for both of which i_sem is
+ held. Just have to be careful in read-/writepage and other helpers
+ not running under i_sem that we play nice... Also need to be careful
+ with initialized_size extension in ntfs_file_write*() and writepage.
+ UPDATE: The only things that need to be checked are the compressed
+ write and the other attribute resize/write cases like index
+ attributes, etc. For now none of these are implemented, so they are safe.
+ - Implement filling in of holes in aops.c::ntfs_writepage() and its
+ helpers.
- Implement mft.c::sync_mft_mirror_umount(). We currently will just
leave the volume dirty on umount if the final iput(vol->mft_ino)
causes a write of any mirrored mft records due to the mft mirror
@@ -22,6 +19,68 @@ ToDo/Notes:
- Enable the code for setting the NT4 compatibility flag when we start
making NTFS 1.2 specific modifications.
+2.1.25 - (Almost) fully implement write(2) and truncate(2).
+
+ - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
+ {__,}ntfs_cluster_free() to also take an optional attribute search
+ context as argument. This allows calling these functions with the
+ mft record mapped. Update all callers.
+ - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
+ error handling by passing in the active search context when calling
+ ntfs_cluster_free().
+ - Change ntfs_cluster_alloc() to take an extra boolean parameter
+ specifying whether the clusters are being allocated to extend an
+ attribute or to fill a hole.
+ - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
+ with @is_extension set to TRUE and remove the runlist terminator
+ fixup code as this is now done by ntfs_cluster_alloc().
+ - Change ntfs_attr_make_non_resident() to take the attribute value size
+ as an extra parameter. This is needed since we need to know the size
+ before we can map the mft record and our callers always know it. The
+ reason we cannot simply read the size from the vfs inode i_size is
+ that this is not necessarily uptodate. This happens when
+ ntfs_attr_make_non_resident() is called in the ->truncate call path.
+ - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks,
+ which is zero for a resident attribute but should no longer be zero
+ once the attribute is non-resident as it then has real clusters
+ allocated.
+ - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
+ extend the allocation of an attribute. Optionally, the data size,
+ but not the initialized size, can be extended, too.
+ - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
+ uncompressed and unencrypted files and it never creates sparse files
+ at least for the moment (making a file sparse requires us to modify
+ its directory entries and we do not support directory operations at
+ the moment). Also, support for highly fragmented files, i.e. ones
+ whose data attribute is split across multiple extents, is severely
+ limited. When such a case is encountered, EOPNOTSUPP is returned.
+ - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
+ the initial implementation of file truncation. Now both open(2)ing
+ a file with the O_TRUNC flag and the {,f}truncate(2) system calls
+ will resize a file appropriately. The limitations are that only
+ uncompressed and unencrypted files are supported. Also, there is
+ only very limited support for highly fragmented files (the ones whose
+ $DATA attribute is split into multiple attribute extents).
+ - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
+ and cond_resched() in the main loop as we could be dirtying a lot of
+ pages, and this ensures we play nice with the VM and the system as a
+ whole.
+ - Implement file operations ->write, ->aio_write, ->writev for regular
+ files. This replaces the old use of generic_file_write() et al., and
+ the address space operations ->prepare_write and ->commit_write.
+ This means that both sparse and non-sparse (unencrypted and
+ uncompressed) files can now be extended using the normal write(2)
+ code path. There are two limitations at present and these are that
+ we never create sparse files and that we only have limited support
+ for highly fragmented files, i.e. ones whose data attribute is split
+ across multiple extents. When such a case is encountered,
+ EOPNOTSUPP is returned.
+ - $EA attributes can be both resident and non-resident.
+ - Use %z for size_t to fix compilation warnings. (Andrew Morton)
+ - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
+ - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
+ patch by Yura Pakhuchiy.)
+
2.1.24 - Lots of bug fixes and support more clean journal states.
- Support journals ($LogFile) which have been modified by chkdsk. This
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 894b2b876d3..d0d45d1c853 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\"
ifeq ($(CONFIG_NTFS_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 5e80c07c6a4..1c0a4315876 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1391,8 +1391,7 @@ retry_writepage:
if (NInoEncrypted(ni)) {
unlock_page(page);
BUG_ON(ni->type != AT_DATA);
- ntfs_debug("Denying write access to encrypted "
- "file.");
+ ntfs_debug("Denying write access to encrypted file.");
return -EACCES;
}
/* Compressed data streams are handled in compress.c. */
@@ -1508,8 +1507,8 @@ retry_writepage:
/* Zero out of bounds area in the page cache page. */
memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_mft_record_page(ctx->ntfs_ino);
flush_dcache_page(page);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
/* We are done with the page. */
end_page_writeback(page);
/* Finally, mark the mft record dirty, so it gets written back. */
@@ -1542,830 +1541,6 @@ err_out:
return err;
}
-/**
- * ntfs_prepare_nonresident_write -
- *
- */
-static int ntfs_prepare_nonresident_write(struct page *page,
- unsigned from, unsigned to)
-{
- VCN vcn;
- LCN lcn;
- s64 initialized_size;
- loff_t i_size;
- sector_t block, ablock, iblock;
- struct inode *vi;
- ntfs_inode *ni;
- ntfs_volume *vol;
- runlist_element *rl;
- struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
- unsigned long flags;
- unsigned int vcn_ofs, block_start, block_end, blocksize;
- int err;
- BOOL is_retry;
- unsigned char blocksize_bits;
-
- vi = page->mapping->host;
- ni = NTFS_I(vi);
- vol = ni->vol;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
- page->index, from, to);
-
- BUG_ON(!NInoNonResident(ni));
-
- blocksize_bits = vi->i_blkbits;
- blocksize = 1 << blocksize_bits;
-
- /*
- * create_empty_buffers() will create uptodate/dirty buffers if the
- * page is uptodate/dirty.
- */
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- bh = head = page_buffers(page);
- if (unlikely(!bh))
- return -ENOMEM;
-
- /* The first block in the page. */
- block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
-
- read_lock_irqsave(&ni->size_lock, flags);
- /*
- * The first out of bounds block for the allocated size. No need to
- * round up as allocated_size is in multiples of cluster size and the
- * minimum cluster size is 512 bytes, which is equal to the smallest
- * blocksize.
- */
- ablock = ni->allocated_size >> blocksize_bits;
- i_size = i_size_read(vi);
- initialized_size = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
-
- /* The last (fully or partially) initialized block. */
- iblock = initialized_size >> blocksize_bits;
-
- /* Loop through all the buffers in the page. */
- block_start = 0;
- rl = NULL;
- err = 0;
- do {
- block_end = block_start + blocksize;
- /*
- * If buffer @bh is outside the write, just mark it uptodate
- * if the page is uptodate and continue with the next buffer.
- */
- if (block_end <= from || block_start >= to) {
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- }
- continue;
- }
- /*
- * @bh is at least partially being written to.
- * Make sure it is not marked as new.
- */
- //if (buffer_new(bh))
- // clear_buffer_new(bh);
-
- if (block >= ablock) {
- // TODO: block is above allocated_size, need to
- // allocate it. Best done in one go to accommodate not
- // only block but all above blocks up to and including:
- // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
- // - 1) >> blobksize_bits. Obviously will need to round
- // up to next cluster boundary, too. This should be
- // done with a helper function, so it can be reused.
- ntfs_error(vol->sb, "Writing beyond allocated size "
- "is not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- goto err_out;
- // Need to update ablock.
- // Need to set_buffer_new() on all block bhs that are
- // newly allocated.
- }
- /*
- * Now we have enough allocated size to fulfill the whole
- * request, i.e. block < ablock is true.
- */
- if (unlikely((block >= iblock) &&
- (initialized_size < i_size))) {
- /*
- * If this page is fully outside initialized size, zero
- * out all pages between the current initialized size
- * and the current page. Just use ntfs_readpage() to do
- * the zeroing transparently.
- */
- if (block > iblock) {
- // TODO:
- // For each page do:
- // - read_cache_page()
- // Again for each page do:
- // - wait_on_page_locked()
- // - Check (PageUptodate(page) &&
- // !PageError(page))
- // Update initialized size in the attribute and
- // in the inode.
- // Again, for each page do:
- // __set_page_dirty_buffers();
- // page_cache_release()
- // We don't need to wait on the writes.
- // Update iblock.
- }
- /*
- * The current page straddles initialized size. Zero
- * all non-uptodate buffers and set them uptodate (and
- * dirty?). Note, there aren't any non-uptodate buffers
- * if the page is uptodate.
- * FIXME: For an uptodate page, the buffers may need to
- * be written out because they were not initialized on
- * disk before.
- */
- if (!PageUptodate(page)) {
- // TODO:
- // Zero any non-uptodate buffers up to i_size.
- // Set them uptodate and dirty.
- }
- // TODO:
- // Update initialized size in the attribute and in the
- // inode (up to i_size).
- // Update iblock.
- // FIXME: This is inefficient. Try to batch the two
- // size changes to happen in one go.
- ntfs_error(vol->sb, "Writing beyond initialized size "
- "is not supported yet. Sorry.");
- err = -EOPNOTSUPP;
- goto err_out;
- // Do NOT set_buffer_new() BUT DO clear buffer range
- // outside write request range.
- // set_buffer_uptodate() on complete buffers as well as
- // set_buffer_dirty().
- }
-
- /* Need to map unmapped buffers. */
- if (!buffer_mapped(bh)) {
- /* Unmapped buffer. Need to map it. */
- bh->b_bdev = vol->sb->s_bdev;
-
- /* Convert block into corresponding vcn and offset. */
- vcn = (VCN)block << blocksize_bits >>
- vol->cluster_size_bits;
- vcn_ofs = ((VCN)block << blocksize_bits) &
- vol->cluster_size_mask;
-
- is_retry = FALSE;
- if (!rl) {
-lock_retry_remap:
- down_read(&ni->runlist.lock);
- rl = ni->runlist.rl;
- }
- if (likely(rl != NULL)) {
- /* Seek to element containing target vcn. */
- while (rl->length && rl[1].vcn <= vcn)
- rl++;
- lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
- } else
- lcn = LCN_RL_NOT_MAPPED;
- if (unlikely(lcn < 0)) {
- /*
- * We extended the attribute allocation above.
- * If we hit an ENOENT here it means that the
- * allocation was insufficient which is a bug.
- */
- BUG_ON(lcn == LCN_ENOENT);
-
- /* It is a hole, need to instantiate it. */
- if (lcn == LCN_HOLE) {
- // TODO: Instantiate the hole.
- // clear_buffer_new(bh);
- // unmap_underlying_metadata(bh->b_bdev,
- // bh->b_blocknr);
- // For non-uptodate buffers, need to
- // zero out the region outside the
- // request in this bh or all bhs,
- // depending on what we implemented
- // above.
- // Need to flush_dcache_page().
- // Or could use set_buffer_new()
- // instead?
- ntfs_error(vol->sb, "Writing into "
- "sparse regions is "
- "not supported yet. "
- "Sorry.");
- err = -EOPNOTSUPP;
- if (!rl)
- up_read(&ni->runlist.lock);
- goto err_out;
- } else if (!is_retry &&
- lcn == LCN_RL_NOT_MAPPED) {
- is_retry = TRUE;
- /*
- * Attempt to map runlist, dropping
- * lock for the duration.
- */
- up_read(&ni->runlist.lock);
- err = ntfs_map_runlist(ni, vcn);
- if (likely(!err))
- goto lock_retry_remap;
- rl = NULL;
- } else if (!rl)
- up_read(&ni->runlist.lock);
- /*
- * Failed to map the buffer, even after
- * retrying.
- */
- if (!err)
- err = -EIO;
- bh->b_blocknr = -1;
- ntfs_error(vol->sb, "Failed to write to inode "
- "0x%lx, attribute type 0x%x, "
- "vcn 0x%llx, offset 0x%x "
- "because its location on disk "
- "could not be determined%s "
- "(error code %i).",
- ni->mft_no, ni->type,
- (unsigned long long)vcn,
- vcn_ofs, is_retry ? " even "
- "after retrying" : "", err);
- goto err_out;
- }
- /* We now have a successful remap, i.e. lcn >= 0. */
-
- /* Setup buffer head to correct block. */
- bh->b_blocknr = ((lcn << vol->cluster_size_bits)
- + vcn_ofs) >> blocksize_bits;
- set_buffer_mapped(bh);
-
- // FIXME: Something analogous to this is needed for
- // each newly allocated block, i.e. BH_New.
- // FIXME: Might need to take this out of the
- // if (!buffer_mapped(bh)) {}, depending on how we
- // implement things during the allocated_size and
- // initialized_size extension code above.
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
- if (PageUptodate(page)) {
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * Page is _not_ uptodate, zero surrounding
- * region. NOTE: This is how we decide if to
- * zero or not!
- */
- if (block_end > to || block_start < from) {
- void *kaddr;
-
- kaddr = kmap_atomic(page, KM_USER0);
- if (block_end > to)
- memset(kaddr + to, 0,
- block_end - to);
- if (block_start < from)
- memset(kaddr + block_start, 0,
- from -
- block_start);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- }
- continue;
- }
- }
- /* @bh is mapped, set it uptodate if the page is uptodate. */
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- continue;
- }
- /*
- * The page is not uptodate. The buffer is mapped. If it is not
- * uptodate, and it is only partially being written to, we need
- * to read the buffer in before the write, i.e. right now.
- */
- if (!buffer_uptodate(bh) &&
- (block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);
- *wait_bh++ = bh;
- }
- } while (block++, block_start = block_end,
- (bh = bh->b_this_page) != head);
-
- /* Release the lock if we took it. */
- if (rl) {
- up_read(&ni->runlist.lock);
- rl = NULL;
- }
-
- /* If we issued read requests, let them complete. */
- while (wait_bh > wait) {
- wait_on_buffer(*--wait_bh);
- if (!buffer_uptodate(*wait_bh))
- return -EIO;
- }
-
- ntfs_debug("Done.");
- return 0;
-err_out:
- /*
- * Zero out any newly allocated blocks to avoid exposing stale data.
- * If BH_New is set, we know that the block was newly allocated in the
- * above loop.
- * FIXME: What about initialized_size increments? Have we done all the
- * required zeroing above? If not this error handling is broken, and
- * in particular the if (block_end <= from) check is completely bogus.
- */
- bh = head;
- block_start = 0;
- is_retry = FALSE;
- do {
- block_end = block_start + blocksize;
- if (block_end <= from)
- continue;
- if (block_start >= to)
- break;
- if (buffer_new(bh)) {
- void *kaddr;
-
- clear_buffer_new(bh);
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + block_start, 0, bh->b_size);
- kunmap_atomic(kaddr, KM_USER0);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- is_retry = TRUE;
- }
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- if (is_retry)
- flush_dcache_page(page);
- if (rl)
- up_read(&ni->runlist.lock);
- return err;
-}
-
-/**
- * ntfs_prepare_write - prepare a page for receiving data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host). The @page is locked but not kmap()ped. The source
- * data has not yet been copied into the @page.
- *
- * Need to extend the attribute/fill in holes if necessary, create blocks and
- * make partially overwritten blocks uptodate,
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using block_prepare_write() [support for sparse files] or
- * cont_prepare_write() [no support for sparse files]. Cannot do that due to
- * ntfs specifics but can look at them for implementation guidance.
- *
- * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
- * the first byte in the page that will be written to and @to is the first byte
- * after the last byte that will be written to.
- */
-static int ntfs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- s64 new_size;
- loff_t i_size;
- struct inode *vi = page->mapping->host;
- ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
- ntfs_volume *vol = ni->vol;
- ntfs_attr_search_ctx *ctx = NULL;
- MFT_RECORD *m = NULL;
- ATTR_RECORD *a;
- u8 *kaddr;
- u32 attr_len;
- int err;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
- page->index, from, to);
- BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
- BUG_ON(from > to);
- BUG_ON(NInoMstProtected(ni));
- /*
- * If a previous ntfs_truncate() failed, repeat it and abort if it
- * fails again.
- */
- if (unlikely(NInoTruncateFailed(ni))) {
- down_write(&vi->i_alloc_sem);
- err = ntfs_truncate(vi);
- up_write(&vi->i_alloc_sem);
- if (err || NInoTruncateFailed(ni)) {
- if (!err)
- err = -EIO;
- goto err_out;
- }
- }
- /* If the attribute is not resident, deal with it elsewhere. */
- if (NInoNonResident(ni)) {
- /*
- * Only unnamed $DATA attributes can be compressed, encrypted,
- * and/or sparse.
- */
- if (ni->type == AT_DATA && !ni->name_len) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- ntfs_debug("Denying write access to encrypted "
- "file.");
- return -EACCES;
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoCompressed(ni)) {
- // TODO: Implement and replace this check with
- // return ntfs_write_compressed_block(page);
- ntfs_error(vi->i_sb, "Writing to compressed "
- "files is not supported yet. "
- "Sorry.");
- return -EOPNOTSUPP;
- }
- // TODO: Implement and remove this check.
- if (NInoSparse(ni)) {
- ntfs_error(vi->i_sb, "Writing to sparse files "
- "is not supported yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
- /* Normal data stream. */
- return ntfs_prepare_nonresident_write(page, from, to);
- }
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * sparse.
- */
- BUG_ON(page_has_buffers(page));
- new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
- /* If we do not need to resize the attribute allocation we are done. */
- if (new_size <= i_size_read(vi))
- goto done;
- /* Map, pin, and lock the (base) mft record. */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- m = ctx->mrec;
- a = ctx->attr;
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- /* Fix an eventual previous failure of ntfs_commit_write(). */
- i_size = i_size_read(vi);
- if (unlikely(attr_len > i_size)) {
- attr_len = i_size;
- a->data.resident.value_length = cpu_to_le32(attr_len);
- }
- /* If we do not need to resize the attribute allocation we are done. */
- if (new_size <= attr_len)
- goto done_unm;
- /* Check if new size is allowed in $AttrDef. */
- err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
- if (unlikely(err)) {
- if (err == -ERANGE) {
- ntfs_error(vol->sb, "Write would cause the inode "
- "0x%lx to exceed the maximum size for "
- "its attribute type (0x%x). Aborting "
- "write.", vi->i_ino,
- le32_to_cpu(ni->type));
- } else {
- ntfs_error(vol->sb, "Inode 0x%lx has unknown "
- "attribute type 0x%x. Aborting "
- "write.", vi->i_ino,
- le32_to_cpu(ni->type));
- err = -EIO;
- }
- goto err_out2;
- }
- /*
- * Extend the attribute record to be able to store the new attribute
- * size.
- */
- if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
- le16_to_cpu(a->data.resident.value_offset) +
- new_size)) {
- /* Not enough space in the mft record. */
- ntfs_error(vol->sb, "Not enough space in the mft record for "
- "the resized attribute value. This is not "
- "supported yet. Aborting write.");
- err = -EOPNOTSUPP;
- goto err_out2;
- }
- /*
- * We have enough space in the mft record to fit the write. This
- * implies the attribute is smaller than the mft record and hence the
- * attribute must be in a single page and hence page->index must be 0.
- */
- BUG_ON(page->index);
- /*
- * If the beginning of the write is past the old size, enlarge the
- * attribute value up to the beginning of the write and fill it with
- * zeroes.
- */
- if (from > attr_len) {
- memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
- attr_len, 0, from - attr_len);
- a->data.resident.value_length = cpu_to_le32(from);
- /* Zero the corresponding area in the page as well. */
- if (PageUptodate(page)) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + attr_len, 0, from - attr_len);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(page);
- }
- }
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
-done_unm:
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- /*
- * Because resident attributes are handled by memcpy() to/from the
- * corresponding MFT record, and because this form of i/o is byte
- * aligned rather than block aligned, there is no need to bring the
- * page uptodate here as in the non-resident case where we need to
- * bring the buffers straddled by the write uptodate before
- * generic_file_write() does the copying from userspace.
- *
- * We thus defer the uptodate bringing of the page region outside the
- * region written to to ntfs_commit_write(), which makes the code
- * simpler and saves one atomic kmap which is good.
- */
-done:
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (err == -ENOMEM)
- ntfs_warning(vi->i_sb, "Error allocating memory required to "
- "prepare the write.");
- else {
- ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
- "with error %i.", err);
- NVolSetErrors(vol);
- make_bad_inode(vi);
- }
-err_out2:
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
-/**
- * ntfs_commit_nonresident_write -
- *
- */
-static int ntfs_commit_nonresident_write(struct page *page,
- unsigned from, unsigned to)
-{
- s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
- struct inode *vi = page->mapping->host;
- struct buffer_head *bh, *head;
- unsigned int block_start, block_end, blocksize;
- BOOL partial;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx, from = %u, to = %u.", vi->i_ino,
- NTFS_I(vi)->type, page->index, from, to);
- blocksize = 1 << vi->i_blkbits;
-
- // FIXME: We need a whole slew of special cases in here for compressed
- // files for example...
- // For now, we know ntfs_prepare_write() would have failed so we can't
- // get here in any of the cases which we have to special case, so we
- // are just a ripped off, unrolled generic_commit_write().
-
- bh = head = page_buffers(page);
- block_start = 0;
- partial = FALSE;
- do {
- block_end = block_start + blocksize;
- if (block_end <= from || block_start >= to) {
- if (!buffer_uptodate(bh))
- partial = TRUE;
- } else {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- }
- } while (block_start = block_end, (bh = bh->b_this_page) != head);
- /*
- * If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus ->readpage() for the next
- * read(). Here we 'discover' whether the page went uptodate as a
- * result of this (potentially partial) write.
- */
- if (!partial)
- SetPageUptodate(page);
- /*
- * Not convinced about this at all. See disparity comment above. For
- * now we know ntfs_prepare_write() would have failed in the write
- * exceeds i_size case, so this will never trigger which is fine.
- */
- if (pos > i_size_read(vi)) {
- ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
- "not supported yet. Sorry.");
- return -EOPNOTSUPP;
- // vi->i_size = pos;
- // mark_inode_dirty(vi);
- }
- ntfs_debug("Done.");
- return 0;
-}
-
-/**
- * ntfs_commit_write - commit the received data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host). The @page is locked but not kmap()ped. The source
- * data has already been copied into the @page. ntfs_prepare_write() has been
- * called before the data copied and it returned success so we can take the
- * results of various BUG checks and some error handling for granted.
- *
- * Need to mark modified blocks dirty so they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using generic_commit_write(). This marks buffers uptodate and
- * dirty, sets the page uptodate if all buffers in the page are uptodate, and
- * updates i_size if the end of io is beyond i_size. In that case, it also
- * marks the inode dirty.
- *
- * Cannot use generic_commit_write() due to ntfs specialities but can look at
- * it for implementation guidance.
- *
- * If things have gone as outlined in ntfs_prepare_write(), then we do not
- * need to do any page content modifications here at all, except in the write
- * to resident attribute case, where we need to do the uptodate bringing here
- * which we combine with the copying into the mft record which means we save
- * one atomic kmap.
- */
-static int ntfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- struct inode *vi = page->mapping->host;
- ntfs_inode *base_ni, *ni = NTFS_I(vi);
- char *kaddr, *kattr;
- ntfs_attr_search_ctx *ctx;
- MFT_RECORD *m;
- ATTR_RECORD *a;
- u32 attr_len;
- int err;
-
- ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
- page->index, from, to);
- /* If the attribute is not resident, deal with it elsewhere. */
- if (NInoNonResident(ni)) {
- /* Only unnamed $DATA attributes can be compressed/encrypted. */
- if (ni->type == AT_DATA && !ni->name_len) {
- /* Encrypted files need separate handling. */
- if (NInoEncrypted(ni)) {
- // We never get here at present!
- BUG();
- }
- /* Compressed data streams are handled in compress.c. */
- if (NInoCompressed(ni)) {
- // TODO: Implement this!
- // return ntfs_write_compressed_block(page);
- // We never get here at present!
- BUG();
- }
- }
- /* Normal data stream. */
- return ntfs_commit_nonresident_write(page, from, to);
- }
- /*
- * Attribute is resident, implying it is not compressed, encrypted, or
- * sparse.
- */
- if (!NInoAttr(ni))
- base_ni = ni;
- else
- base_ni = ni->ext.base_ntfs_ino;
- /* Map, pin, and lock the mft record. */
- m = map_mft_record(base_ni);
- if (IS_ERR(m)) {
- err = PTR_ERR(m);
- m = NULL;
- ctx = NULL;
- goto err_out;
- }
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
- }
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, 0, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
- }
- a = ctx->attr;
- /* The total length of the attribute value. */
- attr_len = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(from > attr_len);
- kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
- kaddr = kmap_atomic(page, KM_USER0);
- /* Copy the received data from the page to the mft record. */
- memcpy(kattr + from, kaddr + from, to - from);
- /* Update the attribute length if necessary. */
- if (to > attr_len) {
- attr_len = to;
- a->data.resident.value_length = cpu_to_le32(attr_len);
- }
- /*
- * If the page is not uptodate, bring the out of bounds area(s)
- * uptodate by copying data from the mft record to the page.
- */
- if (!PageUptodate(page)) {
- if (from > 0)
- memcpy(kaddr, kattr, from);
- if (to < attr_len)
- memcpy(kaddr + to, kattr + to, attr_len - to);
- /* Zero the region outside the end of the attribute value. */
- if (attr_len < PAGE_CACHE_SIZE)
- memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
- /*
- * The probability of not having done any of the above is
- * extremely small, so we just flush unconditionally.
- */
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
- kunmap_atomic(kaddr, KM_USER0);
- /* Update i_size if necessary. */
- if (i_size_read(vi) < attr_len) {
- unsigned long flags;
-
- write_lock_irqsave(&ni->size_lock, flags);
- ni->allocated_size = ni->initialized_size = attr_len;
- i_size_write(vi, attr_len);
- write_unlock_irqrestore(&ni->size_lock, flags);
- }
- /* Mark the mft record dirty, so it gets written back. */
- flush_dcache_mft_record_page(ctx->ntfs_ino);
- mark_mft_record_dirty(ctx->ntfs_ino);
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (err == -ENOMEM) {
- ntfs_warning(vi->i_sb, "Error allocating memory required to "
- "commit the write.");
- if (PageUptodate(page)) {
- ntfs_warning(vi->i_sb, "Page is uptodate, setting "
- "dirty so the write will be retried "
- "later on by the VM.");
- /*
- * Put the page on mapping->dirty_pages, but leave its
- * buffers' dirty state as-is.
- */
- __set_page_dirty_nobuffers(page);
- err = 0;
- } else
- ntfs_error(vi->i_sb, "Page is not uptodate. Written "
- "data has been lost.");
- } else {
- ntfs_error(vi->i_sb, "Resident attribute commit write failed "
- "with error %i.", err);
- NVolSetErrors(ni->vol);
- make_bad_inode(vi);
- }
- if (ctx)
- ntfs_attr_put_search_ctx(ctx);
- if (m)
- unmap_mft_record(base_ni);
- return err;
-}
-
#endif /* NTFS_RW */
/**
@@ -2377,9 +1552,6 @@ struct address_space_operations ntfs_aops = {
disk request queue. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
- .prepare_write = ntfs_prepare_write, /* Prepare page and buffers
- ready to receive data. */
- .commit_write = ntfs_commit_write, /* Commit received data. */
#endif /* NTFS_RW */
};
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 3f9a4ff42ee..eda056bac25 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -21,7 +21,9 @@
*/
#include <linux/buffer_head.h>
+#include <linux/sched.h>
#include <linux/swap.h>
+#include <linux/writeback.h>
#include "attrib.h"
#include "debug.h"
@@ -36,9 +38,27 @@
* ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
* @ni: ntfs inode for which to map (part of) a runlist
* @vcn: map runlist part containing this vcn
+ * @ctx: active attribute search context if present or NULL if not
*
* Map the part of a runlist containing the @vcn of the ntfs inode @ni.
*
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_map_runlist_nolock(), you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
* Return 0 on success and -errno on error. There is one special error code
* which is not an error as such. This is -ENOENT. It means that @vcn is out
* of bounds of the runlist.
@@ -46,19 +66,32 @@
* Note the runlist can be NULL after this function returns if @vcn is zero and
* the attribute has zero allocated size, i.e. there simply is no runlist.
*
- * Locking: - The runlist must be locked for writing.
- * - This function modifies the runlist.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
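+ * An illustrative caller sketch (hypothetical, not part of this change),
+ * combining the pointer refresh and the IS_ERR() check described above:
+ *	err = ntfs_map_runlist_nolock(ni, vcn, ctx);
+ *	if (IS_ERR(ctx->mrec)) {
+ *		/* @ctx is no longer valid, release it. */
+ *		err = PTR_ERR(ctx->mrec);
+ *		ntfs_attr_put_search_ctx(ctx);
+ *	} else {
+ *		m = ctx->mrec;
+ *		a = ctx->attr;
+ *	}
+ *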
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist will be modified.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
*/
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
{
VCN end_vcn;
+ unsigned long flags;
ntfs_inode *base_ni;
MFT_RECORD *m;
ATTR_RECORD *a;
- ntfs_attr_search_ctx *ctx;
runlist_element *rl;
- unsigned long flags;
+ struct page *put_this_page = NULL;
int err = 0;
+ BOOL ctx_is_temporary, ctx_needs_reset;
+ ntfs_attr_search_ctx old_ctx = { NULL, };
ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
(unsigned long long)vcn);
@@ -66,20 +99,77 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
base_ni = ni;
else
base_ni = ni->ext.base_ntfs_ino;
- m = map_mft_record(base_ni);
- if (IS_ERR(m))
- return PTR_ERR(m);
- ctx = ntfs_attr_get_search_ctx(base_ni, m);
- if (unlikely(!ctx)) {
- err = -ENOMEM;
- goto err_out;
+ if (!ctx) {
+ ctx_is_temporary = ctx_needs_reset = TRUE;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ } else {
+ VCN allocated_size_vcn;
+
+ BUG_ON(IS_ERR(ctx->mrec));
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ ctx_is_temporary = FALSE;
+ end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size_vcn = ni->allocated_size >>
+ ni->vol->cluster_size_bits;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
+ end_vcn = allocated_size_vcn - 1;
+ /*
+ * If we already have the attribute extent containing @vcn in
+ * @ctx, no need to look it up again. We slightly cheat in
+ * that if vcn exceeds the allocated size, we will refuse to
+ * map the runlist below, so there is definitely no need to get
+ * the right attribute extent.
+ */
+ if (vcn >= allocated_size_vcn || (a->type == ni->type &&
+ a->name_length == ni->name_len &&
+ !memcmp((u8*)a + le16_to_cpu(a->name_offset),
+ ni->name, ni->name_len) &&
+ sle64_to_cpu(a->data.non_resident.lowest_vcn)
+ <= vcn && end_vcn >= vcn))
+ ctx_needs_reset = FALSE;
+ else {
+ /* Save the old search context. */
+ old_ctx = *ctx;
+ /*
+ * If the currently mapped (extent) inode is not the
+ * base inode we will unmap it when we reinitialize the
+ * search context which means we need to get a
+ * reference to the page containing the mapped mft
+ * record so we do not accidentally drop changes to the
+ * mft record when it has not been marked dirty yet.
+ */
+ if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
+ old_ctx.base_ntfs_ino) {
+ put_this_page = old_ctx.ntfs_ino->page;
+ page_cache_get(put_this_page);
+ }
+ /*
+ * Reinitialize the search context so we can lookup the
+ * needed attribute extent.
+ */
+ ntfs_attr_reinit_search_ctx(ctx);
+ ctx_needs_reset = TRUE;
+ }
}
- err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
- CASE_SENSITIVE, vcn, NULL, 0, ctx);
- if (unlikely(err)) {
- if (err == -ENOENT)
- err = -EIO;
- goto err_out;
+ if (ctx_needs_reset) {
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, vcn, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ BUG_ON(!ctx->attr->non_resident);
}
a = ctx->attr;
/*
@@ -89,11 +179,9 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
* ntfs_mapping_pairs_decompress() fails.
*/
end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
- if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) {
- read_lock_irqsave(&ni->size_lock, flags);
- end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits;
- read_unlock_irqrestore(&ni->size_lock, flags);
- }
+ if (!a->data.non_resident.lowest_vcn && end_vcn == 1)
+ end_vcn = sle64_to_cpu(a->data.non_resident.allocated_size) >>
+ ni->vol->cluster_size_bits;
if (unlikely(vcn >= end_vcn)) {
err = -ENOENT;
goto err_out;
@@ -104,9 +192,93 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
else
ni->runlist.rl = rl;
err_out:
- if (likely(ctx))
- ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(base_ni);
+ if (ctx_is_temporary) {
+ if (likely(ctx))
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ } else if (ctx_needs_reset) {
+ /*
+ * If there is no attribute list, restoring the search context
+ * is accomplished simply by copying the saved context back over
+ * the caller supplied context. If there is an attribute list,
+ * things are more complicated as we need to deal with mapping
+ * of mft records and resulting potential changes in pointers.
+ */
+ if (NInoAttrList(base_ni)) {
+ /*
+ * If the currently mapped (extent) inode is not the
+ * one we had before, we need to unmap it and map the
+ * old one.
+ */
+ if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
+ /*
+ * If the currently mapped inode is not the
+ * base inode, unmap it.
+ */
+ if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
+ ctx->base_ntfs_ino) {
+ unmap_extent_mft_record(ctx->ntfs_ino);
+ ctx->mrec = ctx->base_mrec;
+ BUG_ON(!ctx->mrec);
+ }
+ /*
+ * If the old mapped inode is not the base
+ * inode, map it.
+ */
+ if (old_ctx.base_ntfs_ino &&
+ old_ctx.ntfs_ino !=
+ old_ctx.base_ntfs_ino) {
+retry_map:
+ ctx->mrec = map_mft_record(
+ old_ctx.ntfs_ino);
+ /*
+ * Something bad has happened. If out
+ * of memory retry till it succeeds.
+ * Any other errors are fatal and we
+ * return the error code in ctx->mrec.
+ * Let the caller deal with it... We
+ * just need to fudge things so the
+ * caller can reinit and/or put the
+ * search context safely.
+ */
+ if (IS_ERR(ctx->mrec)) {
+ if (PTR_ERR(ctx->mrec) ==
+ -ENOMEM) {
+ schedule();
+ goto retry_map;
+ } else
+ old_ctx.ntfs_ino =
+ old_ctx.
+ base_ntfs_ino;
+ }
+ }
+ }
+ /* Update the changed pointers in the saved context. */
+ if (ctx->mrec != old_ctx.mrec) {
+ if (!IS_ERR(ctx->mrec))
+ old_ctx.attr = (ATTR_RECORD*)(
+ (u8*)ctx->mrec +
+ ((u8*)old_ctx.attr -
+ (u8*)old_ctx.mrec));
+ old_ctx.mrec = ctx->mrec;
+ }
+ }
+ /* Restore the search context to the saved one. */
+ *ctx = old_ctx;
+ /*
+ * We drop the reference on the page we took earlier. In the
+ * case that IS_ERR(ctx->mrec) is true this means we might lose
+ * some changes to the mft record that had been made between
+ * the last time it was marked dirty/written out and now. This
+ * at this stage is not a problem as the mapping error is fatal
+ * enough that the mft record cannot be written out anyway and
+ * the caller is very likely to shutdown the whole inode
+ * immediately and mark the volume dirty for chkdsk to pick up
+ * the pieces anyway.
+ */
+ if (put_this_page)
+ page_cache_release(put_this_page);
+ }
return err;
}
@@ -122,8 +294,8 @@ err_out:
* of bounds of the runlist.
*
* Locking: - The runlist must be unlocked on entry and is unlocked on return.
- * - This function takes the runlist lock for writing and modifies the
- * runlist.
+ * - This function takes the runlist lock for writing and may modify
+ * the runlist.
*/
int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
{
@@ -133,7 +305,7 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
/* Make sure someone else didn't do the work while we were sleeping. */
if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
LCN_RL_NOT_MAPPED))
- err = ntfs_map_runlist_nolock(ni, vcn);
+ err = ntfs_map_runlist_nolock(ni, vcn, NULL);
up_write(&ni->runlist.lock);
return err;
}
@@ -212,7 +384,7 @@ retry_remap:
goto retry_remap;
}
}
- err = ntfs_map_runlist_nolock(ni, vcn);
+ err = ntfs_map_runlist_nolock(ni, vcn, NULL);
if (!write_locked) {
up_write(&ni->runlist.lock);
down_read(&ni->runlist.lock);
@@ -236,9 +408,9 @@ retry_remap:
/**
* ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni: ntfs inode describing the runlist to search
- * @vcn: vcn to find
- * @write_locked: true if the runlist is locked for writing
+ * @ni: ntfs inode describing the runlist to search
+ * @vcn: vcn to find
+ * @ctx: active attribute search context if present or NULL if not
*
* Find the virtual cluster number @vcn in the runlist described by the ntfs
* inode @ni and return the address of the runlist element containing the @vcn.
@@ -246,9 +418,22 @@ retry_remap:
* If the @vcn is not mapped yet, the attempt is made to map the attribute
* extent containing the @vcn and the vcn to lcn conversion is retried.
*
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_attr_find_vcn_nolock(), you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
* Note you need to distinguish between the lcn of the returned runlist element
 * being >= 0 and LCN_HOLE. In the latter case you have to return zeroes on
* read and allocate clusters on write.
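+ *
+ * An illustrative sketch of that distinction (hypothetical caller code,
+ * not part of this change):
+ *	rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
+ *	if (!IS_ERR(rl)) {
+ *		if (rl->lcn >= 0)
+ *			; /* @vcn is mapped, perform the read/write */
+ *		else if (rl->lcn == LCN_HOLE)
+ *			; /* return zeroes on read, allocate on write */
+ *	}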
@@ -263,22 +448,31 @@ retry_remap:
* -ENOMEM - Not enough memory to map runlist.
* -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
*
- * Locking: - The runlist must be locked on entry and is left locked on return.
- * - If @write_locked is FALSE, i.e. the runlist is locked for reading,
- * the lock may be dropped inside the function so you cannot rely on
- * the runlist still being the same when this function returns.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist may be modified when
+ * needed runlist fragments need to be mapped.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
*/
runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
- const BOOL write_locked)
+ ntfs_attr_search_ctx *ctx)
{
unsigned long flags;
runlist_element *rl;
int err = 0;
BOOL is_retry = FALSE;
- ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
- ni->mft_no, (unsigned long long)vcn,
- write_locked ? "write" : "read");
+ ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
+ ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
BUG_ON(!ni);
BUG_ON(!NInoNonResident(ni));
BUG_ON(vcn < 0);
@@ -312,33 +506,22 @@ retry_remap:
}
if (!err && !is_retry) {
/*
- * The @vcn is in an unmapped region, map the runlist and
- * retry.
+ * If the search context is invalid we cannot map the unmapped
+ * region.
*/
- if (!write_locked) {
- up_read(&ni->runlist.lock);
- down_write(&ni->runlist.lock);
- if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
- LCN_RL_NOT_MAPPED)) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
+ if (IS_ERR(ctx->mrec))
+ err = PTR_ERR(ctx->mrec);
+ else {
+ /*
+ * The @vcn is in an unmapped region, map the runlist
+ * and retry.
+ */
+ err = ntfs_map_runlist_nolock(ni, vcn, ctx);
+ if (likely(!err)) {
+ is_retry = TRUE;
goto retry_remap;
}
}
- err = ntfs_map_runlist_nolock(ni, vcn);
- if (!write_locked) {
- up_write(&ni->runlist.lock);
- down_read(&ni->runlist.lock);
- }
- if (likely(!err)) {
- is_retry = TRUE;
- goto retry_remap;
- }
- /*
- * -EINVAL coming from a failed mapping attempt is equivalent
- * to i/o error for us as it should not happen in our code
- * paths.
- */
if (err == -EINVAL)
err = -EIO;
} else if (!err)
@@ -1011,6 +1194,7 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
ntfs_inode *base_ni;
ntfs_debug("Entering.");
+ BUG_ON(IS_ERR(ctx->mrec));
if (ctx->base_ntfs_ino)
base_ni = ctx->base_ntfs_ino;
else
@@ -1227,7 +1411,7 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
*/
int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
{
- if (type == AT_INDEX_ALLOCATION || type == AT_EA)
+ if (type == AT_INDEX_ALLOCATION)
return -EPERM;
return 0;
}
@@ -1319,10 +1503,17 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
/**
* ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
* @ni: ntfs inode describing the attribute to convert
+ * @data_size: size of the resident data to copy to the non-resident attribute
*
* Convert the resident ntfs attribute described by the ntfs inode @ni to a
* non-resident one.
*
+ * @data_size must be equal to the attribute value size. This is needed since
+ * we need to know the size before we can map the mft record and our callers
+ * always know it. The reason we cannot simply read the size from the vfs
+ * inode i_size is that this is not necessarily uptodate. This happens when
+ * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
+ *
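+ * An illustrative call (hypothetical, modelled on how this change itself
+ * invokes the function with the resident value length):
+ *	attr_len = le32_to_cpu(a->data.resident.value_length);
+ *	err = ntfs_attr_make_non_resident(ni, attr_len);
+ *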
* Return 0 on success and -errno on error. The following error return codes
* are defined:
* -EPERM - The attribute is not allowed to be non-resident.
@@ -1343,7 +1534,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
*
* Locking: - The caller must hold i_sem on the inode.
*/
-int ntfs_attr_make_non_resident(ntfs_inode *ni)
+int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
{
s64 new_size;
struct inode *vi = VFS_I(ni);
@@ -1381,11 +1572,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
* The size needs to be aligned to a cluster boundary for allocation
* purposes.
*/
- new_size = (i_size_read(vi) + vol->cluster_size - 1) &
+ new_size = (data_size + vol->cluster_size - 1) &
~(vol->cluster_size - 1);
if (new_size > 0) {
- runlist_element *rl2;
-
/*
* Will need the page later and since the page lock nests
* outside all ntfs locks, we need to get the page now.
@@ -1396,7 +1585,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
return -ENOMEM;
/* Start by allocating clusters to hold the attribute value. */
rl = ntfs_cluster_alloc(vol, 0, new_size >>
- vol->cluster_size_bits, -1, DATA_ZONE);
+ vol->cluster_size_bits, -1, DATA_ZONE, TRUE);
if (IS_ERR(rl)) {
err = PTR_ERR(rl);
ntfs_debug("Failed to allocate cluster%s, error code "
@@ -1405,12 +1594,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
err);
goto page_err_out;
}
- /* Change the runlist terminator to LCN_ENOENT. */
- rl2 = rl;
- while (rl2->length)
- rl2++;
- BUG_ON(rl2->lcn != LCN_RL_NOT_MAPPED);
- rl2->lcn = LCN_ENOENT;
} else {
rl = NULL;
page = NULL;
@@ -1473,7 +1656,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
* attribute value.
*/
attr_size = le32_to_cpu(a->data.resident.value_length);
- BUG_ON(attr_size != i_size_read(vi));
+ BUG_ON(attr_size != data_size);
if (page && !PageUptodate(page)) {
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, (u8*)a +
@@ -1538,7 +1721,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
ffs(ni->itype.compressed.block_size) - 1;
ni->itype.compressed.block_clusters = 1U <<
a->data.non_resident.compression_unit;
- }
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = ni->allocated_size >> 9;
write_unlock_irqrestore(&ni->size_lock, flags);
/*
* This needs to be last since the address space operations ->readpage
@@ -1652,6 +1837,640 @@ page_err_out:
}
/**
+ * ntfs_attr_extend_allocation - extend the allocated space of an attribute
+ * @ni: ntfs inode of the attribute whose allocation to extend
+ * @new_alloc_size: new size in bytes to which to extend the allocation to
+ * @new_data_size: new size in bytes to which to extend the data to
+ * @data_start: beginning of region which is required to be non-sparse
+ *
+ * Extend the allocated space of an attribute described by the ntfs inode @ni
+ * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
+ * implemented as a hole in the file (as long as both the volume and the ntfs
+ * inode @ni have sparse support enabled). If @data_start is >= 0, then the
+ * region between the old allocated size and @data_start - 1 may be made sparse
+ * but the regions between @data_start and @new_alloc_size must be backed by
+ * actual clusters.
+ *
+ * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
+ * of the attribute is extended to @new_data_size. Note that the i_size of the
+ * vfs inode is not updated. Only the data size in the base attribute record
+ * is updated. The caller has to update i_size separately if this is required.
+ * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
+ * size as well as for @new_data_size to be greater than @new_alloc_size.
+ *
+ * For resident attributes this involves resizing the attribute record and if
+ * necessary moving it and/or other attributes into extent mft records and/or
+ * converting the attribute to a non-resident attribute which in turn involves
+ * extending the allocation of a non-resident attribute as described below.
+ *
+ * For non-resident attributes this involves allocating clusters in the data
+ * zone on the volume (except for regions that are being made sparse) and
+ * extending the run list to describe the allocated clusters as well as
+ * updating the mapping pairs array of the attribute. This in turn involves
+ * resizing the attribute record and if necessary moving it and/or other
+ * attributes into extent mft records and/or splitting the attribute record
+ * into multiple extent attribute records.
+ *
+ * Also, the attribute list attribute is updated if present and in some of the
+ * above cases (the ones where extent mft records/attributes come into play),
+ * an attribute list attribute is created if not already present.
+ *
+ * Return the new allocated size on success and -errno on error. In the case
+ * that an error is encountered but a partial extension at least up to
+ * @data_start (if present) is possible, the allocation is partially extended
+ * and this is returned. This means the caller must check the returned size to
+ * determine if the extension was partial. If @data_start is -1 then partial
+ * allocations are not performed.
+ *
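+ * An illustrative sketch of the partial-extension check (hypothetical
+ * caller code, not part of this change):
+ *	ll = ntfs_attr_extend_allocation(ni, new_alloc_size, -1, pos);
+ *	if (ll < 0)
+ *		return ll;	/* -errno, nothing was extended */
+ *	if (ll < new_alloc_size)
+ *		; /* only a partial extension up to ll was possible */
+ *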
+ * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
+ *
+ * Locking: This function takes the runlist lock of @ni for writing as well as
+ * locking the mft record of the base ntfs inode. Both locks are held
+ * throughout execution of the function and are required so that the
+ * attribute can be resized safely and so that it can for example be converted
+ * from resident to non-resident safely.
+ *
+ * TODO: At present attribute list attribute handling is not implemented.
+ *
+ * TODO: At present it is not safe to call this function for anything other
+ * than the $DATA attribute(s) of an uncompressed and unencrypted file.
+ */
+s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+ const s64 new_data_size, const s64 data_start)
+{
+ VCN vcn;
+ s64 ll, allocated_size, start = data_start;
+ struct inode *vi = VFS_I(ni);
+ ntfs_volume *vol = ni->vol;
+ ntfs_inode *base_ni;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ runlist_element *rl, *rl2;
+ unsigned long flags;
+ int err, mp_size;
+ u32 attr_len = 0; /* Silence stupid gcc warning. */
+ BOOL mp_rebuilt;
+
+#ifdef NTFS_DEBUG
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+ "old_allocated_size 0x%llx, "
+ "new_allocated_size 0x%llx, new_data_size 0x%llx, "
+ "data_start 0x%llx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)allocated_size,
+ (unsigned long long)new_alloc_size,
+ (unsigned long long)new_data_size,
+ (unsigned long long)start);
+#endif
+retry_extend:
+ /*
+ * For non-resident attributes, @start and @new_size need to be aligned
+ * to cluster boundaries for allocation purposes.
+ */
+ if (NInoNonResident(ni)) {
+ if (start > 0)
+ start &= ~(s64)vol->cluster_size_mask;
+ new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
+ ~(s64)vol->cluster_size_mask;
+ }
+ BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
+ /* Check if new size is allowed in $AttrDef. */
+ err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
+ if (unlikely(err)) {
+ /* Only emit errors when the write will fail completely. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size) {
+ if (err == -ERANGE) {
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because the new "
+ "allocation would exceed the "
+ "maximum allowed size for "
+ "this attribute type.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ } else {
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because this "
+ "attribute type is not "
+ "defined on the NTFS volume. "
+ "Possible corruption! You "
+ "should run chkdsk!",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ }
+ }
+ /* Translate error code to be POSIX conformant for write(2). */
+ if (err == -ERANGE)
+ err = -EFBIG;
+ else
+ err = -EIO;
+ return err;
+ }
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /*
+ * We will be modifying both the runlist (if non-resident) and the mft
+ * record so lock them both down.
+ */
+ down_write(&ni->runlist.lock);
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * If non-resident, seek to the last extent. If resident, there is
+ * only one extent, so seek to that.
+ */
+ vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
+ 0;
+ /*
+ * Abort if someone did the work whilst we waited for the locks. If we
+ * just converted the attribute from resident to non-resident it is
+ * likely that exactly this has happened already. We cannot quite
+ * abort if we need to update the data size.
+ */
+ if (unlikely(new_alloc_size <= allocated_size)) {
+ ntfs_debug("Allocated size already exceeds requested size.");
+ new_alloc_size = allocated_size;
+ if (new_data_size < 0)
+ goto done;
+ /*
+ * We want the first attribute extent so that we can update the
+ * data size.
+ */
+ vcn = 0;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, vcn, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /* Use goto to reduce indentation. */
+ if (a->non_resident)
+ goto do_non_resident_extend;
+ BUG_ON(NInoNonResident(ni));
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ /*
+ * Extend the attribute record to be able to store the new attribute
+ * size. ntfs_attr_record_resize() will not do anything if the size is
+ * not changing.
+ */
+ if (new_alloc_size < vol->mft_record_size &&
+ !ntfs_attr_record_resize(m, a,
+ le16_to_cpu(a->data.resident.value_offset) +
+ new_alloc_size)) {
+ /* The resize succeeded! */
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ if (new_data_size >= 0) {
+ BUG_ON(new_data_size < attr_len);
+ a->data.resident.value_length =
+ cpu_to_le32((u32)new_data_size);
+ }
+ goto flush_done;
+ }
+ /*
+ * We have to drop all the locks so we can call
+ * ntfs_attr_make_non_resident(). This could be optimised by try-
+ * locking the first page cache page and only if that fails dropping
+ * the locks, locking the page, and redoing all the locking and
+ * lookups. While this would be a huge optimisation, it is not worth
+ * it as this is definitely a slow code path.
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * Not enough space in the mft record, try to make the attribute
+ * non-resident and if successful restart the extension process.
+ */
+ err = ntfs_attr_make_non_resident(ni, attr_len);
+ if (likely(!err))
+ goto retry_extend;
+ /*
+ * Could not make non-resident. If this is due to this not being
+ * permitted for this attribute type or there not being enough space,
+ * try to make other attributes non-resident. Otherwise fail.
+ */
+ if (unlikely(err != -EPERM && err != -ENOSPC)) {
+ /* Only emit errors when the write will fail completely. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the conversion from resident "
+ "to non-resident attribute failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto conv_err_out;
+ }
+ /* TODO: Not implemented from here, abort. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size) {
+ if (err == -ENOSPC)
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record/on disk for the non-resident "
+ "attribute value. This case is not "
+ "implemented yet.");
+ else /* if (err == -EPERM) */
+ ntfs_error(vol->sb, "This attribute type may not be "
+ "non-resident. This case is not "
+ "implemented yet.");
+ }
+ err = -EOPNOTSUPP;
+ goto conv_err_out;
+#if 0
+ // TODO: Attempt to make other attributes non-resident.
+ if (!err)
+ goto do_resident_extend;
+ /*
+ * Both the attribute list attribute and the standard information
+ * attribute must remain in the base inode. Thus, if this is one of
+ * these attributes, we have to try to move other attributes out into
+ * extent mft records instead.
+ */
+ if (ni->type == AT_ATTRIBUTE_LIST ||
+ ni->type == AT_STANDARD_INFORMATION) {
+ // TODO: Attempt to move other attributes into extent mft
+ // records.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ goto err_out;
+ }
+ // TODO: Attempt to move this attribute to an extent mft record, but
+ // only if it is not already the only attribute in an mft record in
+ // which case there would be nothing to gain.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ /* There is nothing we can do to make enough space. )-: */
+ goto err_out;
+#endif
+do_non_resident_extend:
+ BUG_ON(!NInoNonResident(ni));
+ if (new_alloc_size == allocated_size) {
+ BUG_ON(vcn);
+ goto alloc_done;
+ }
+ /*
+ * If the data starts after the end of the old allocation, this is a
+ * $DATA attribute and sparse attributes are enabled on the volume and
+ * for this inode, then create a sparse region between the old
+ * allocated size and the start of the data. Otherwise simply proceed
+ * with filling the whole space between the old allocated size and the
+ * new allocated size with clusters.
+ */
+ if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
+ !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
+ goto skip_sparse;
+ // TODO: This is not implemented yet. We just fill in with real
+ // clusters for now...
+ ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
+ "allocating real clusters instead.");
+skip_sparse:
+ rl = ni->runlist.rl;
+ if (likely(rl)) {
+ /* Seek to the end of the runlist. */
+ while (rl->length)
+ rl++;
+ }
+ /* If this attribute extent is not mapped, map it now. */
+ if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
+ (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
+ (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
+ if (!rl && !allocated_size)
+ goto first_alloc;
+ rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because the "
+ "mapping of a runlist "
+ "fragment failed with error "
+ "code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ /* Seek to the end of the runlist. */
+ while (rl->length)
+ rl++;
+ }
+ /*
+ * We now know the runlist of the last extent is mapped and @rl is at
+ * the end of the runlist. We want to begin allocating clusters
+ * starting at the last allocated cluster to reduce fragmentation. If
+ * there are no valid LCNs in the attribute we let the cluster
+ * allocator choose the starting cluster.
+ */
+ /* If the last LCN is a hole or similar, seek back to the last real LCN. */
+ while (rl->lcn < 0 && rl > ni->runlist.rl)
+ rl--;
+first_alloc:
+ // FIXME: Need to implement partial allocations so at least part of the
+ // write can be performed when start >= 0. (Needed for POSIX write(2)
+ // conformance.)
+ rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
+ (new_alloc_size - allocated_size) >>
+ vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
+ rl->lcn + rl->length : -1, DATA_ZONE, TRUE);
+ if (IS_ERR(rl2)) {
+ err = PTR_ERR(rl2);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the allocation of clusters "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM && err != -ENOSPC)
+ err = -EIO;
+ goto err_out;
+ }
+ rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the runlist merge failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to release allocated "
+ "cluster(s) in error code path. Run "
+ "chkdsk to recover the lost "
+ "cluster(s).");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
+ allocated_size) >> vol->cluster_size_bits);
+ /* Find the runlist element with which the attribute extent starts. */
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
+ BUG_ON(!rl2);
+ BUG_ON(!rl2->length);
+ BUG_ON(rl2->lcn < LCN_HOLE);
+ mp_rebuilt = FALSE;
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+ if (unlikely(mp_size <= 0)) {
+ err = mp_size;
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because determining the size for the "
+ "mapping pairs failed with error code "
+ "%i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Extend the attribute record to fit the bigger mapping pairs array. */
+ attr_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(err)) {
+ BUG_ON(err != -ENOSPC);
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record,
+ // possibly by extending this extent partially and filling it
+ // and creating a new extent for the remainder, or by making
+ // other attributes non-resident and/or by moving other
+ // attributes out of this mft record.
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record for the extended attribute "
+ "record. This case is not "
+ "implemented yet.");
+ err = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ mp_rebuilt = TRUE;
+ /* Generate the mapping pairs array directly into the attr record. */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, -1, NULL);
+ if (unlikely(err)) {
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because building the mapping pairs "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+ vol->cluster_size_bits) - 1);
+ /*
+ * We now have extended the allocated size of the attribute. Reflect
+ * this in the ntfs_inode structure and the attribute record.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto restore_undo_alloc;
+ /* @m is not used any more so no need to set it. */
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+ /*
+ * FIXME: This would fail if @ni is a directory, $MFT, or an index,
+ * since those can have the sparse/compressed bits set. For example, a
+ * directory can be set compressed even though it is not compressed
+ * itself; in that case the bit means that files are to be created
+ * compressed in the directory... At present this is ok as this code
+ * is only called for
+ * regular files, and only for their $DATA attribute(s).
+ * FIXME: The calculation is wrong if we created a hole above. For now
+ * it does not matter as we never create holes.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size += new_alloc_size - allocated_size;
+ a->data.non_resident.compressed_size =
+ cpu_to_sle64(ni->itype.compressed.size);
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+alloc_done:
+ if (new_data_size >= 0) {
+ BUG_ON(new_data_size <
+ sle64_to_cpu(a->data.non_resident.data_size));
+ a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
+ }
+flush_done:
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+done:
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ ntfs_debug("Done, new_allocated_size 0x%llx.",
+ (unsigned long long)new_alloc_size);
+ return new_alloc_size;
+restore_undo_alloc:
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot complete extension of allocation "
+ "of inode 0x%lx, attribute type 0x%x, because "
+ "lookup of first attribute extent failed with "
+ "error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err == -ENOENT)
+ err = -EIO;
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
+ allocated_size >> vol->cluster_size_bits, NULL, 0,
+ ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "attribute in error code path. Run chkdsk to "
+ "recover.");
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ /*
+ * FIXME: This would fail if @ni is a directory... See above.
+ * FIXME: The calculation is wrong if we created a hole above.
+ * For now it does not matter as we never create holes.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size += new_alloc_size -
+ allocated_size;
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * The only thing that is now wrong is the allocated size of the
+ * base attribute extent which chkdsk should be able to fix.
+ */
+ NVolSetErrors(vol);
+ return err;
+ }
+ ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
+ (allocated_size >> vol->cluster_size_bits) - 1);
+undo_alloc:
+ ll = allocated_size >> vol->cluster_size_bits;
+ if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
+ ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
+ "in error code path. Run chkdsk to recover "
+ "the lost cluster(s).");
+ NVolSetErrors(vol);
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /*
+ * If the runlist truncation fails and/or the search context is no
+ * longer valid, we cannot resize the attribute record or build the
+ * mapping pairs array thus we mark the inode bad so that no access to
+ * the freed clusters can happen.
+ */
+ if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
+ ntfs_error(vol->sb, "Failed to %s in error code path. Run "
+ "chkdsk to recover.", IS_ERR(m) ?
+ "restore attribute search context" :
+ "truncate attribute runlist");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ } else if (mp_rebuilt) {
+ if (ntfs_attr_record_resize(m, a, attr_len)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record in error code path. Run "
+ "chkdsk to recover.");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.
+ mapping_pairs_offset), attr_len -
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), rl2, ll, -1,
+ NULL)) {
+ ntfs_error(vol->sb, "Failed to restore "
+ "mapping pairs array in error "
+ "code path. Run chkdsk to "
+ "recover.");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ }
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+conv_err_out:
+ ntfs_debug("Failed. Returning error code %i.", err);
+ return err;
+}
+
+/**
* ntfs_attr_set - fill (a part of) an attribute with a byte
* @ni: ntfs inode describing the attribute to fill
* @ofs: offset inside the attribute at which to start to fill
@@ -1773,6 +2592,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
/* Finally unlock and release the page. */
unlock_page(page);
page_cache_release(page);
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
}
/* If there is a last partial page, need to do it the slow way. */
if (end_ofs) {
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0618ed6fd7b..9074886b44b 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -60,14 +60,15 @@ typedef struct {
ATTR_RECORD *base_attr;
} ntfs_attr_search_ctx;
-extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn);
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
+ ntfs_attr_search_ctx *ctx);
extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
const BOOL write_locked);
extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
- const VCN vcn, const BOOL write_locked);
+ const VCN vcn, ntfs_attr_search_ctx *ctx);
int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
const u32 name_len, const IGNORE_CASE_BOOL ic,
@@ -102,7 +103,10 @@ extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
const u32 new_size);
-extern int ntfs_attr_make_non_resident(ntfs_inode *ni);
+extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
+
+extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+ const s64 new_data_size, const s64 data_start);
extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
const u8 val);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423..cf3e6ced2d0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/pagemap.h>
#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+
+#include "attrib.h"
+#include "bitmap.h"
#include "inode.h"
#include "debug.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
#include "ntfs.h"
/**
@@ -56,6 +69,2184 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
#ifdef NTFS_RW
/**
+ * ntfs_attr_extend_initialized - extend the initialized size of an attribute
+ * @ni: ntfs inode of the attribute to extend
+ * @new_init_size: requested new initialized size in bytes
+ * @cached_page: store any allocated but unused page here
+ * @lru_pvec: lru-buffering pagevec of the caller
+ *
+ * Extend the initialized size of an attribute described by the ntfs inode @ni
+ * to @new_init_size bytes. This involves zeroing any non-sparse space between
+ * the old initialized size and @new_init_size both in the page cache and on
+ * disk (if relevant complete pages are already uptodate in the page cache then
+ * these are simply marked dirty).
+ *
+ * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
+ * in the resident attribute case, it is tied to the initialized size and, in
+ * the non-resident attribute case, it may not fall below the initialized size.
+ *
+ * Note that if the attribute is resident, we do not need to touch the page
+ * cache at all. This is because if the page cache page is not uptodate we
+ * bring it uptodate later, when doing the write to the mft record since we
+ * then already have the page mapped. And if the page is uptodate, the
+ * non-initialized region will already have been zeroed when the page was
+ * brought uptodate and the region may in fact already have been overwritten
+ * with new data via mmap() based writes, so we cannot just zero it. And since
+ * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
+ * is unspecified, we choose not to do zeroing and thus we do not need to touch
+ * the page at all. For a more detailed explanation see ntfs_truncate() in
+ * fs/ntfs/inode.c.
+ *
+ * @cached_page and @lru_pvec are just optimizations for dealing with multiple
+ * pages.
+ *
+ * Return 0 on success and -errno on error. In the case that an error is
+ * encountered it is possible that the initialized size will already have been
+ * incremented some way towards @new_init_size but it is guaranteed that if
+ * this is the case, the necessary zeroing will also have happened and that all
+ * metadata is self-consistent.
+ *
+ * Locking: i_sem on the vfs inode corrseponsind to the ntfs inode @ni must be
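+ * An illustrative use (hypothetical write-path caller, not part of this
+ * change; size_lock locking elided for brevity), extending the
+ * initialized size before writing at @pos:
+ *	if (pos > ni->initialized_size)
+ *		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
+ *				&lru_pvec);
+ *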
+ * held by the caller.
+ */
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
+ struct page **cached_page, struct pagevec *lru_pvec)
+{
+ s64 old_init_size;
+ loff_t old_i_size;
+ pgoff_t index, end_index;
+ unsigned long flags;
+ struct inode *vi = VFS_I(ni);
+ ntfs_inode *base_ni;
+ MFT_RECORD *m = NULL;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx = NULL;
+ struct address_space *mapping;
+ struct page *page = NULL;
+ u8 *kattr;
+ int err;
+ u32 attr_len;
+
+ read_lock_irqsave(&ni->size_lock, flags);
+ old_init_size = ni->initialized_size;
+ old_i_size = i_size_read(vi);
+ BUG_ON(new_init_size > ni->allocated_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+ "old_initialized_size 0x%llx, "
+ "new_initialized_size 0x%llx, i_size 0x%llx.",
+ vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)old_init_size,
+ (unsigned long long)new_init_size, old_i_size);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Use goto to reduce indentation and we need the label below anyway. */
+ if (NInoNonResident(ni))
+ goto do_non_resident_extend;
+ BUG_ON(old_init_size != old_i_size);
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(a->non_resident);
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ BUG_ON(old_i_size != (loff_t)attr_len);
+ /*
+ * Do the zeroing in the mft record and update the attribute size in
+ * the mft record.
+ */
+ kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+ memset(kattr + attr_len, 0, new_init_size - attr_len);
+ a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
+ /* Finally, update the sizes in the vfs and ntfs inodes. */
+ write_lock_irqsave(&ni->size_lock, flags);
+ i_size_write(vi, new_init_size);
+ ni->initialized_size = new_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ goto done;
+do_non_resident_extend:
+ /*
+ * If the new initialized size @new_init_size exceeds the current file
+ * size (vfs inode->i_size), we need to extend the file size to the
+ * new initialized size.
+ */
+ if (new_init_size > old_i_size) {
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ BUG_ON(old_i_size != (loff_t)
+ sle64_to_cpu(a->data.non_resident.data_size));
+ a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /* Update the file size in the vfs inode. */
+ i_size_write(vi, new_init_size);
+ ntfs_attr_put_search_ctx(ctx);
+ ctx = NULL;
+ unmap_mft_record(base_ni);
+ m = NULL;
+ }
+ mapping = vi->i_mapping;
+ index = old_init_size >> PAGE_CACHE_SHIFT;
+ end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ do {
+ /*
+ * Read the page. If the page is not present, this will zero
+ * the uninitialized regions for us.
+ */
+ page = read_cache_page(mapping, index,
+ (filler_t*)mapping->a_ops->readpage, NULL);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto init_err_out;
+ }
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page) || PageError(page))) {
+ page_cache_release(page);
+ err = -EIO;
+ goto init_err_out;
+ }
+ /*
+ * Update the initialized size in the ntfs inode. This is
+ * enough to make ntfs_writepage() work.
+ */
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
+ if (ni->initialized_size > new_init_size)
+ ni->initialized_size = new_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Set the page dirty so it gets written out. */
+ set_page_dirty(page);
+ page_cache_release(page);
+ /*
+ * Play nice with the vm and the rest of the system. This is
+ * very much needed as we can potentially be modifying the
+ * initialised size from a very small value to a really huge
+ * value, e.g.
+ * f = open(somefile, O_TRUNC);
+ * truncate(f, 10GiB);
+ * seek(f, 10GiB);
+ * write(f, 1);
+ * And this would mean we would be marking dirty hundreds of
+ * thousands of pages or as in the above example more than
+ * two and a half million pages!
+ *
+ * TODO: For sparse pages could optimize this workload by using
+ * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
+ * would be set in readpage for sparse pages and here we would
+ * not need to mark dirty any pages which have this bit set.
+ * The only caveat is that we have to clear the bit everywhere
+ * where we allocate any clusters that lie in the page or that
+ * contain the page.
+ *
+ * TODO: An even greater optimization would be for us to only
+ * call readpage() on pages which are not in sparse regions as
+ * determined from the runlist. This would greatly reduce the
+ * number of pages we read and make dirty in the case of sparse
+ * files.
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (++index < end_index);
+ read_lock_irqsave(&ni->size_lock, flags);
+ BUG_ON(ni->initialized_size != new_init_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Now bring in sync the initialized_size in the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto init_err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto init_err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto init_err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
+done:
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
+ (unsigned long long)new_init_size, i_size_read(vi));
+ return 0;
+init_err_out:
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = old_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_debug("Failed. Returning error code %i.", err);
+ return err;
+}
+
+/**
+ * ntfs_fault_in_pages_readable - fault a number of userspace pages into pagetables
+ *
+ * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
+ * with more than two userspace pages as well as handling the single page case
+ * elegantly.
+ *
+ * If you find this difficult to understand, then think of the while loop being
+ * the following code, except that we do without the integer variable ret:
+ *
+ * do {
+ * ret = __get_user(c, uaddr);
+ * uaddr += PAGE_SIZE;
+ * } while (!ret && uaddr < end);
+ *
+ * Note, the final __get_user() may well run out-of-bounds of the user buffer,
+ * but _not_ out-of-bounds of the page the user buffer belongs to, and since
+ * this is only a read and not a write, and since it is still in the same page,
+ * it should not matter and this makes the code much simpler.
+ */
+static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
+ int bytes)
+{
+ const char __user *end;
+ volatile char c;
+
+ /* Set @end to the first byte outside the last page we care about. */
+ end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
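+ /*
+ * E.g. (illustrative values) uaddr == 0x1234, bytes == 0x2000:
+ * @end becomes 0x4000 and the loop below touches the three
+ * pages at 0x1000, 0x2000 and 0x3000; the final read at 0x3234
+ * is just past the buffer but still inside its last page.
+ */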
+
+ while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
+ ;
+}
+
+/**
+ * ntfs_fault_in_pages_readable_iovec - fault in userspace pages from an iovec array
+ *
+ * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
+ */
+static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
+ size_t iov_ofs, int bytes)
+{
+ do {
+ const char __user *buf;
+ unsigned len;
+
+ buf = iov->iov_base + iov_ofs;
+ len = iov->iov_len - iov_ofs;
+ if (len > bytes)
+ len = bytes;
+ ntfs_fault_in_pages_readable(buf, len);
+ bytes -= len;
+ iov++;
+ iov_ofs = 0;
+ } while (bytes);
+}
+
+/**
+ * __ntfs_grab_cache_pages - obtain a number of locked pages
+ * @mapping: address space mapping from which to obtain page cache pages
+ * @index: starting index in @mapping at which to begin obtaining pages
+ * @nr_pages: number of page cache pages to obtain
+ * @pages: array of pages in which to return the obtained page cache pages
+ * @cached_page: allocated but as yet unused page
+ * @lru_pvec: lru-buffering pagevec of caller
+ *
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping,
+ * starting at index @index.
+ *
+ * If a page is newly created, increment its refcount and add it to the
+ * caller's lru-buffering pagevec @lru_pvec.
+ *
+ * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
+ * are obtained at once instead of just one page and that 0 is returned on
+ * success and -errno on error.
+ *
+ * Note, the page locks are obtained in ascending page index order.
+ */
+static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
+ pgoff_t index, const unsigned nr_pages, struct page **pages,
+ struct page **cached_page, struct pagevec *lru_pvec)
+{
+ int err, nr;
+
+ BUG_ON(!nr_pages);
+ err = nr = 0;
+ do {
+ pages[nr] = find_lock_page(mapping, index);
+ if (!pages[nr]) {
+ if (!*cached_page) {
+ *cached_page = page_cache_alloc(mapping);
+ if (unlikely(!*cached_page)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ err = add_to_page_cache(*cached_page, mapping, index,
+ GFP_KERNEL);
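+ /*
+ * -EEXIST means another task beat us to inserting a
+ * page at @index; retry without advancing @index so
+ * find_lock_page() finds that page instead.
+ */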
+ if (unlikely(err)) {
+ if (err == -EEXIST)
+ continue;
+ goto err_out;
+ }
+ pages[nr] = *cached_page;
+ page_cache_get(*cached_page);
+ if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
+ __pagevec_lru_add(lru_pvec);
+ *cached_page = NULL;
+ }
+ index++;
+ nr++;
+ } while (nr < nr_pages);
+out:
+ return err;
+err_out:
+ while (nr > 0) {
+ unlock_page(pages[--nr]);
+ page_cache_release(pages[nr]);
+ }
+ goto out;
+}
+
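+/*
+ * Submit a buffer head for read i/o. Callers collect the buffers
+ * submitted here and wait_on_buffer() them once all reads have been
+ * issued (see the wait[] array in
+ * ntfs_prepare_pages_for_non_resident_write() below).
+ */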
+static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+{
+ lock_buffer(bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ return submit_bh(READ, bh);
+}
+
+/**
+ * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * This is called for non-resident attributes from ntfs_file_buffered_write()
+ * with i_sem held on the inode (@pages[0]->mapping->host). There are
+ * @nr_pages pages in @pages which are locked but not kmap()ped. The source
+ * data has not yet been copied into the @pages.
+ *
+ * Need to fill any holes with actual clusters, allocate buffers if necessary,
+ * ensure all the buffers are mapped, and bring uptodate any buffers that are
+ * only partially being written to.
+ *
+ * If @nr_pages is greater than one, we are guaranteed that the cluster size is
+ * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
+ * the same cluster and that they are the entirety of that cluster, and that
+ * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
+ *
+ * i_size is not to be modified yet.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
+ unsigned nr_pages, s64 pos, size_t bytes)
+{
+ VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
+ LCN lcn;
+ s64 bh_pos, vcn_len, end, initialized_size;
+ sector_t lcn_block;
+ struct page *page;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni = NULL;
+ ntfs_volume *vol;
+ runlist_element *rl, *rl2;
+ struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *m = NULL;
+ ATTR_RECORD *a = NULL;
+ unsigned long flags;
+ u32 attr_rec_len = 0;
+ unsigned blocksize, u;
+ int err, mp_size;
+ BOOL rl_write_locked, was_hole, is_retry;
+ unsigned char blocksize_bits;
+ struct {
+ u8 runlist_merged:1;
+ u8 mft_attr_mapped:1;
+ u8 mp_rebuilt:1;
+ u8 attr_switched:1;
+ } status = { 0, 0, 0, 0 };
+
+ BUG_ON(!nr_pages);
+ BUG_ON(!pages);
+ BUG_ON(!*pages);
+ vi = pages[0]->mapping->host;
+ ni = NTFS_I(vi);
+ vol = ni->vol;
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+ "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+ vi->i_ino, ni->type, pages[0]->index, nr_pages,
+ (long long)pos, bytes);
+ blocksize_bits = vi->i_blkbits;
+ blocksize = 1 << blocksize_bits;
+ u = 0;
+ do {
+ struct page *page = pages[u];
+ /*
+ * create_empty_buffers() will create uptodate/dirty buffers if
+ * the page is uptodate/dirty.
+ */
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, blocksize, 0);
+ if (unlikely(!page_has_buffers(page)))
+ return -ENOMEM;
+ }
+ } while (++u < nr_pages);
+ rl_write_locked = FALSE;
+ rl = NULL;
+ err = 0;
+ vcn = lcn = -1;
+ vcn_len = 0;
+ lcn_block = -1;
+ was_hole = FALSE;
+ cpos = pos >> vol->cluster_size_bits;
+ end = pos + bytes;
+ cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
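+ /*
+ * E.g. (illustrative, assuming a 4 KiB cluster size): pos ==
+ * 5000 and bytes == 3000 give end == 8000, cpos == 1 and
+ * cend == 2, i.e. the write lies entirely inside cluster 1.
+ */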
+ /*
+ * Loop over each page and for each page over each buffer. Use goto to
+ * reduce indentation.
+ */
+ u = 0;
+do_next_page:
+ page = pages[u];
+ bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh = head = page_buffers(page);
+ do {
+ VCN cdelta;
+ s64 bh_end;
+ unsigned bh_cofs;
+
+ /* Clear buffer_new on all buffers to reinitialise state. */
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ bh_end = bh_pos + blocksize;
+ bh_cpos = bh_pos >> vol->cluster_size_bits;
+ bh_cofs = bh_pos & vol->cluster_size_mask;
+ if (buffer_mapped(bh)) {
+ /*
+ * The buffer is already mapped. If it is uptodate,
+ * ignore it.
+ */
+ if (buffer_uptodate(bh))
+ continue;
+ /*
+ * The buffer is not uptodate. If the page is uptodate
+ * set the buffer uptodate and otherwise ignore it.
+ */
+ if (PageUptodate(page)) {
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ /*
+ * Neither the page nor the buffer are uptodate. If
+ * the buffer is only partially being written to, we
+ * need to read it in before the write, i.e. now.
+ */
+ if ((bh_pos < pos && bh_end > pos) ||
+ (bh_pos < end && bh_end > end)) {
+ /*
+ * If the buffer is fully or partially within
+ * the initialized size, do an actual read.
+ * Otherwise, simply zero the buffer.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (bh_pos < initialized_size) {
+ ntfs_submit_bh_for_read(bh);
+ *wait_bh++ = bh;
+ } else {
+ u8 *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh), 0,
+ blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ }
+ continue;
+ }
+ /* Unmapped buffer. Need to map it. */
+ bh->b_bdev = vol->sb->s_bdev;
+ /*
+ * If the current buffer is in the same clusters as the map
+ * cache, there is no need to check the runlist again. The
+ * map cache is made up of @vcn, which is the first cached file
+ * cluster, @vcn_len which is the number of cached file
+ * clusters, @lcn is the device cluster corresponding to @vcn,
+ * and @lcn_block is the block number corresponding to @lcn.
+ */
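+ /*
+ * Illustrative geometry (assumed): 4 KiB clusters and 512 byte
+ * blocks give 8 blocks per cluster, so a buffer two clusters
+ * past @vcn at byte offset 1024 into its cluster gets
+ * b_blocknr == lcn_block + 2 * 8 + (1024 >> 9) == lcn_block + 18.
+ */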
+ cdelta = bh_cpos - vcn;
+ if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
+map_buffer_cached:
+ BUG_ON(lcn < 0);
+ bh->b_blocknr = lcn_block +
+ (cdelta << (vol->cluster_size_bits -
+ blocksize_bits)) +
+ (bh_cofs >> blocksize_bits);
+ set_buffer_mapped(bh);
+ /*
+ * If the page is uptodate so is the buffer. If the
+ * buffer is fully outside the write, we ignore it if
+ * it was already allocated and we mark it dirty so it
+ * gets written out if we allocated it. On the other
+ * hand, if we allocated the buffer but we are not
+ * marking it dirty we set buffer_new so we can do
+ * error recovery.
+ */
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ if (unlikely(was_hole)) {
+ /* We allocated the buffer. */
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ if (bh_end <= pos || bh_pos >= end)
+ mark_buffer_dirty(bh);
+ else
+ set_buffer_new(bh);
+ }
+ continue;
+ }
+ /* Page is _not_ uptodate. */
+ if (likely(!was_hole)) {
+ /*
+ * Buffer was already allocated. If it is not
+ * uptodate and is only partially being written
+ * to, we need to read it in before the write,
+ * i.e. now.
+ */
+ if (!buffer_uptodate(bh) && ((bh_pos < pos &&
+ bh_end > pos) ||
+ (bh_pos < end &&
+ bh_end > end))) {
+ /*
+ * If the buffer is fully or partially
+ * within the initialized size, do an
+ * actual read. Otherwise, simply zero
+ * the buffer.
+ */
+ read_lock_irqsave(&ni->size_lock,
+ flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock,
+ flags);
+ if (bh_pos < initialized_size) {
+ ntfs_submit_bh_for_read(bh);
+ *wait_bh++ = bh;
+ } else {
+ u8 *kaddr = kmap_atomic(page,
+ KM_USER0);
+ memset(kaddr + bh_offset(bh),
+ 0, blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ }
+ continue;
+ }
+ /* We allocated the buffer. */
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ /*
+ * If the buffer is fully outside the write, zero it,
+ * set it uptodate, and mark it dirty so it gets
+ * written out. If it is partially being written to,
+ * zero region surrounding the write but leave it to
+ * commit write to do anything else. Finally, if the
+ * buffer is fully being overwritten, do nothing.
+ */
+ if (bh_end <= pos || bh_pos >= end) {
+ if (!buffer_uptodate(bh)) {
+ u8 *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh), 0,
+ blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ mark_buffer_dirty(bh);
+ continue;
+ }
+ set_buffer_new(bh);
+ if (!buffer_uptodate(bh) &&
+ (bh_pos < pos || bh_end > end)) {
+ u8 *kaddr;
+ unsigned pofs;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (bh_pos < pos) {
+ pofs = bh_pos & ~PAGE_CACHE_MASK;
+ memset(kaddr + pofs, 0, pos - bh_pos);
+ }
+ if (bh_end > end) {
+ pofs = end & ~PAGE_CACHE_MASK;
+ memset(kaddr + pofs, 0, bh_end - end);
+ }
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ }
+ continue;
+ }
+ /*
+ * Slow path: this is the first buffer in the cluster. If it
+ * is outside allocated size and is not uptodate, zero it and
+ * set it uptodate.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->allocated_size;
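+ /* Note: @initialized_size is re-used to hold the allocated size here. */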
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (bh_pos > initialized_size) {
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh)) {
+ u8 *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh), 0, blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ is_retry = FALSE;
+ if (!rl) {
+ down_read(&ni->runlist.lock);
+retry_remap:
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target cluster. */
+ while (rl->length && rl[1].vcn <= bh_cpos)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
+ if (likely(lcn >= 0)) {
+ /*
+ * Successful remap, setup the map cache and
+ * use that to deal with the buffer.
+ */
+ was_hole = FALSE;
+ vcn = bh_cpos;
+ vcn_len = rl[1].vcn - vcn;
+ lcn_block = lcn << (vol->cluster_size_bits -
+ blocksize_bits);
+ cdelta = 0;
+ /*
+ * If the number of remaining clusters in the
+ * @pages is smaller or equal to the number of
+ * cached clusters, unlock the runlist as the
+ * map cache will be used from now on.
+ */
+ if (likely(vcn + vcn_len >= cend)) {
+ if (rl_write_locked) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = FALSE;
+ } else
+ up_read(&ni->runlist.lock);
+ rl = NULL;
+ }
+ goto map_buffer_cached;
+ }
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ /*
+ * If it is not a hole and not out of bounds, the runlist is
+ * probably unmapped so try to map it now.
+ */
+ if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
+ if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
+ /* Attempt to map runlist. */
+ if (!rl_write_locked) {
+ /*
+ * We need the runlist locked for
+ * writing, so if it is locked for
+ * reading relock it now and retry in
+ * case it changed whilst we dropped
+ * the lock.
+ */
+ up_read(&ni->runlist.lock);
+ down_write(&ni->runlist.lock);
+ rl_write_locked = TRUE;
+ goto retry_remap;
+ }
+ err = ntfs_map_runlist_nolock(ni, bh_cpos,
+ NULL);
+ if (likely(!err)) {
+ is_retry = TRUE;
+ goto retry_remap;
+ }
+ /*
+ * If @vcn is out of bounds, pretend @lcn is
+ * LCN_ENOENT. As long as the buffer is out
+ * of bounds this will work fine.
+ */
+ if (err == -ENOENT) {
+ lcn = LCN_ENOENT;
+ err = 0;
+ goto rl_not_mapped_enoent;
+ }
+ } else
+ err = -EIO;
+ /* Failed to map the buffer, even after retrying. */
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+ "attribute type 0x%x, vcn 0x%llx, "
+ "vcn offset 0x%x, because its "
+ "location on disk could not be "
+ "determined%s (error code %i).",
+ ni->mft_no, ni->type,
+ (unsigned long long)bh_cpos,
+ (unsigned)bh_pos &
+ vol->cluster_size_mask,
+ is_retry ? " even after retrying" : "",
+ err);
+ break;
+ }
+rl_not_mapped_enoent:
+ /*
+ * The buffer is in a hole or out of bounds. We need to fill
+ * the hole, unless the buffer is in a cluster which is not
+ * touched by the write, in which case we just leave the buffer
+ * unmapped. This can only happen when the cluster size is
+ * less than the page cache size.
+ */
+ if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+ bh_cend = (bh_end + vol->cluster_size - 1) >>
+ vol->cluster_size_bits;
+ if ((bh_cend <= cpos || bh_cpos >= cend)) {
+ bh->b_blocknr = -1;
+ /*
+ * If the buffer is uptodate we skip it. If it
+ * is not but the page is uptodate, we can set
+ * the buffer uptodate. If the page is not
+ * uptodate, we can clear the buffer and set it
+ * uptodate. Whether this is worthwhile is
+ * debatable and this could be removed.
+ */
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh)) {
+ u8 *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh), 0,
+ blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ }
+ /*
+ * An out of bounds buffer (lcn == LCN_ENOENT) would have been
+ * handled above, so the only valid case left here is a hole.
+ */
+ BUG_ON(lcn != LCN_HOLE);
+ /*
+ * We need the runlist locked for writing, so if it is locked
+ * for reading relock it now and retry in case it changed
+ * whilst we dropped the lock.
+ */
+ BUG_ON(!rl);
+ if (!rl_write_locked) {
+ up_read(&ni->runlist.lock);
+ down_write(&ni->runlist.lock);
+ rl_write_locked = TRUE;
+ goto retry_remap;
+ }
+ /* Find the previous last allocated cluster. */
+ BUG_ON(rl->lcn != LCN_HOLE);
+ lcn = -1;
+ rl2 = rl;
+ while (--rl2 >= ni->runlist.rl) {
+ if (rl2->lcn >= 0) {
+ lcn = rl2->lcn + rl2->length;
+ break;
+ }
+ }
+ rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
+ FALSE);
+ if (IS_ERR(rl2)) {
+ err = PTR_ERR(rl2);
+ ntfs_debug("Failed to allocate cluster, error code %i.",
+ err);
+ break;
+ }
+ lcn = rl2->lcn;
+ rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (err != -ENOMEM)
+ err = -EIO;
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to release "
+ "allocated cluster in error "
+ "code path. Run chkdsk to "
+ "recover the lost cluster.");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ break;
+ }
+ ni->runlist.rl = rl;
+ status.runlist_merged = 1;
+ ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
+ /* Map and lock the mft record and get the attribute record. */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ break;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ unmap_mft_record(base_ni);
+ break;
+ }
+ status.mft_attr_mapped = 1;
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ break;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /*
+ * Find the runlist element with which the attribute extent
+ * starts. Note, we cannot use the _attr_ version because we
+ * have mapped the mft record. That is ok because we know the
+ * runlist fragment must be mapped already to have ever gotten
+ * here, so we can just use the _rl_ version.
+ */
+ vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
+ BUG_ON(!rl2);
+ BUG_ON(!rl2->length);
+ BUG_ON(rl2->lcn < LCN_HOLE);
+ highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ /*
+ * If @highest_vcn is zero, calculate the real highest_vcn
+ * (which can really be zero).
+ */
+ if (!highest_vcn)
+ highest_vcn = (sle64_to_cpu(
+ a->data.non_resident.allocated_size) >>
+ vol->cluster_size_bits) - 1;
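+ /*
+ * E.g. (illustrative) an allocated_size of two 4 KiB clusters
+ * gives highest_vcn == (8192 >> 12) - 1 == 1, while a single
+ * cluster extent gives the legitimately zero value 0.
+ */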
+ /*
+ * Determine the size of the mapping pairs array for the new
+ * extent, i.e. the old extent with the hole filled.
+ */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
+ highest_vcn);
+ if (unlikely(mp_size <= 0)) {
+ if (!(err = mp_size))
+ err = -EIO;
+ ntfs_debug("Failed to get size for mapping pairs "
+ "array, error code %i.", err);
+ break;
+ }
+ /*
+ * Resize the attribute record to fit the new mapping pairs
+ * array.
+ */
+ attr_rec_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(err)) {
+ BUG_ON(err != -ENOSPC);
+ // TODO: Deal with this by using the current attribute
+ // and fill it with as much of the mapping pairs
+ // array as possible. Then loop over each attribute
+ // extent rewriting the mapping pairs arrays as we go
+ // along and if when we reach the end we have not
+ // enough space, try to resize the last attribute
+ // extent and if even that fails, add a new attribute
+ // extent.
+ // We could also try to resize at each step in the hope
+ // that we will not need to rewrite every single extent.
+ // Note, we may need to decompress some extents to fill
+ // the runlist as we are walking the extents...
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record for the extended attribute "
+ "record. This case is not "
+ "implemented yet.");
+ err = -EOPNOTSUPP;
+ break;
+ }
+ status.mp_rebuilt = 1;
+ /*
+ * Generate the mapping pairs array directly into the attribute
+ * record.
+ */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, vcn, highest_vcn, NULL);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
+ "attribute type 0x%x, because building "
+ "the mapping pairs failed with error "
+ "code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ break;
+ }
+ /* Update the highest_vcn but only if it was not set. */
+ if (unlikely(!a->data.non_resident.highest_vcn))
+ a->data.non_resident.highest_vcn =
+ cpu_to_sle64(highest_vcn);
+ /*
+ * If the attribute is sparse/compressed, update the compressed
+ * size in the ntfs_inode structure and the attribute record.
+ */
+ if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
+ /*
+ * If we are not in the first attribute extent, switch
+ * to it, but first ensure the changes will make it to
+ * disk later.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(ni->type, ni->name,
+ ni->name_len, CASE_SENSITIVE,
+ 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ status.attr_switched = 1;
+ break;
+ }
+ /* @m is not used any more so do not set it. */
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->itype.compressed.size += vol->cluster_size;
+ a->data.non_resident.compressed_size =
+ cpu_to_sle64(ni->itype.compressed.size);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ }
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ /* Successfully filled the hole. */
+ status.runlist_merged = 0;
+ status.mft_attr_mapped = 0;
+ status.mp_rebuilt = 0;
+ /* Setup the map cache and use that to deal with the buffer. */
+ was_hole = TRUE;
+ vcn = bh_cpos;
+ vcn_len = 1;
+ lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
+ cdelta = 0;
+ /*
+ * If the number of remaining clusters in the @pages is smaller
+ * or equal to the number of cached clusters, unlock the
+ * runlist as the map cache will be used from now on.
+ */
+ if (likely(vcn + vcn_len >= cend)) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = FALSE;
+ rl = NULL;
+ }
+ goto map_buffer_cached;
+ } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+ /* If there are no errors, do the next page. */
+ if (likely(!err && ++u < nr_pages))
+ goto do_next_page;
+ /* If there are no errors, release the runlist lock if we took it. */
+ if (likely(!err)) {
+ if (unlikely(rl_write_locked)) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = FALSE;
+ } else if (unlikely(rl))
+ up_read(&ni->runlist.lock);
+ rl = NULL;
+ }
+ /* If we issued read requests, let them complete. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ while (wait_bh > wait) {
+ bh = *--wait_bh;
+ wait_on_buffer(bh);
+ if (likely(buffer_uptodate(bh))) {
+ page = bh->b_page;
+ bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+ bh_offset(bh);
+ /*
+ * If the buffer overflows the initialized size, need
+ * to zero the overflowing region.
+ */
+ if (unlikely(bh_pos + blocksize > initialized_size)) {
+ u8 *kaddr;
+ int ofs = 0;
+
+ if (likely(bh_pos < initialized_size))
+ ofs = initialized_size - bh_pos;
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh) + ofs, 0,
+ blocksize - ofs);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ }
+ } else /* if (unlikely(!buffer_uptodate(bh))) */
+ err = -EIO;
+ }
+ if (likely(!err)) {
+ /* Clear buffer_new on all buffers. */
+ u = 0;
+ do {
+ bh = head = page_buffers(pages[u]);
+ do {
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ } while ((bh = bh->b_this_page) != head);
+ } while (++u < nr_pages);
+ ntfs_debug("Done.");
+ return err;
+ }
+ if (status.attr_switched) {
+ /* Get back to the attribute extent we modified. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find required "
+ "attribute extent of attribute in "
+ "error code path. Run chkdsk to "
+ "recover.");
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->itype.compressed.size += vol->cluster_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /*
+ * The only thing that is now wrong is the compressed
+ * size of the base attribute extent which chkdsk
+ * should be able to fix.
+ */
+ NVolSetErrors(vol);
+ } else {
+ m = ctx->mrec;
+ a = ctx->attr;
+ status.attr_switched = 0;
+ }
+ }
+ /*
+ * If the runlist has been modified, need to restore it by punching a
+ * hole into it and we then need to deallocate the on-disk cluster as
+ * well. Note, we only modify the runlist if we are able to generate a
+ * new mapping pairs array, i.e. only when the mapped attribute extent
+ * is not switched.
+ */
+ if (status.runlist_merged && !status.attr_switched) {
+ BUG_ON(!rl_write_locked);
+ /* Make the file cluster we allocated sparse in the runlist. */
+ if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
+ ntfs_error(vol->sb, "Failed to punch hole into "
+ "attribute runlist in error code "
+ "path. Run chkdsk to recover the "
+ "lost cluster.");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ status.runlist_merged = 0;
+ /*
+ * Deallocate the on-disk cluster we allocated but only
+ * if we succeeded in punching its vcn out of the
+ * runlist.
+ */
+ down_write(&vol->lcnbmp_lock);
+ if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+ ntfs_error(vol->sb, "Failed to release "
+ "allocated cluster in error "
+ "code path. Run chkdsk to "
+ "recover the lost cluster.");
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->lcnbmp_lock);
+ }
+ }
+ /*
+ * Resize the attribute record to its old size and rebuild the mapping
+ * pairs array. Note, we only can do this if the runlist has been
+ * restored to its old state which also implies that the mapped
+ * attribute extent is not switched.
+ */
+ if (status.mp_rebuilt && !status.runlist_merged) {
+ if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record in error code path. Run "
+ "chkdsk to recover.");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), attr_rec_len -
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), ni->runlist.rl,
+ vcn, highest_vcn, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore "
+ "mapping pairs array in error "
+ "code path. Run chkdsk to "
+ "recover.");
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ }
+ /* Release the mft record and the attribute. */
+ if (status.mft_attr_mapped) {
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ }
+ /* Release the runlist lock. */
+ if (rl_write_locked)
+ up_write(&ni->runlist.lock);
+ else if (rl)
+ up_read(&ni->runlist.lock);
+ /*
+ * Zero out any newly allocated blocks to avoid exposing stale data.
+ * If BH_New is set, we know that the block was newly allocated above
+ * and that it has not been fully zeroed and marked dirty yet.
+ */
+ nr_pages = u;
+ u = 0;
+ end = bh_cpos << vol->cluster_size_bits;
+ do {
+ page = pages[u];
+ bh = head = page_buffers(page);
+ do {
+ if (u == nr_pages &&
+ ((s64)page->index << PAGE_CACHE_SHIFT) +
+ bh_offset(bh) >= end)
+ break;
+ if (!buffer_new(bh))
+ continue;
+ clear_buffer_new(bh);
+ if (!buffer_uptodate(bh)) {
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+ else {
+ u8 *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + bh_offset(bh), 0,
+ blocksize);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page);
+ set_buffer_uptodate(bh);
+ }
+ }
+ mark_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+ } while (++u <= nr_pages);
+ ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
+ return err;
+}
+
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied. If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static inline size_t ntfs_copy_from_user(struct page **pages,
+ unsigned nr_pages, unsigned ofs, const char __user *buf,
+ size_t bytes)
+{
+ struct page **last_page = pages + nr_pages;
+ char *kaddr;
+ size_t total = 0;
+ unsigned len;
+ int left;
+
+ do {
+ len = PAGE_CACHE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ kaddr = kmap_atomic(*pages, KM_USER0);
+ left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (unlikely(left)) {
+ /* Do it the slow way. */
+ kaddr = kmap(*pages);
+ left = __copy_from_user(kaddr + ofs, buf, len);
+ kunmap(*pages);
+ if (unlikely(left))
+ goto err_out;
+ }
+ total += len;
+ bytes -= len;
+ if (!bytes)
+ break;
+ buf += len;
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err_out:
+ total += len - left;
+ /* Zero the rest of the target like __copy_from_user(). */
+ while (++pages < last_page) {
+ bytes -= len;
+ if (!bytes)
+ break;
+ len = PAGE_CACHE_SIZE;
+ if (len > bytes)
+ len = bytes;
+ kaddr = kmap_atomic(*pages, KM_USER0);
+ memset(kaddr, 0, len);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ goto out;
+}
+
+static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+ const struct iovec *iov, size_t iov_ofs, size_t bytes)
+{
+ size_t total = 0;
+
+ while (1) {
+ const char __user *buf = iov->iov_base + iov_ofs;
+ unsigned len;
+ size_t left;
+
+ len = iov->iov_len - iov_ofs;
+ if (len > bytes)
+ len = bytes;
+ left = __copy_from_user_inatomic(vaddr, buf, len);
+ total += len;
+ bytes -= len;
+ vaddr += len;
+ if (unlikely(left)) {
+ /*
+ * Zero the rest of the target like __copy_from_user().
+ */
+ memset(vaddr, 0, bytes);
+ total -= left;
+ break;
+ }
+ if (!bytes)
+ break;
+ iov++;
+ iov_ofs = 0;
+ }
+ return total;
+}
+
+static inline void ntfs_set_next_iovec(const struct iovec **iovp,
+ size_t *iov_ofsp, size_t bytes)
+{
+ const struct iovec *iov = *iovp;
+ size_t iov_ofs = *iov_ofsp;
+
+ while (bytes) {
+ unsigned len;
+
+ len = iov->iov_len - iov_ofs;
+ if (len > bytes)
+ len = bytes;
+ bytes -= len;
+ iov_ofs += len;
+ if (iov->iov_len == iov_ofs) {
+ iov++;
+ iov_ofs = 0;
+ }
+ }
+ *iovp = iov;
+ *iov_ofsp = iov_ofs;
+}
+
+/*
+ * This has the same side-effects and return value as ntfs_copy_from_user().
+ * The difference is that on a fault we need to memset the remainder of the
+ * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
+ * single-segment behaviour.
+ *
+ * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
+ * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
+ * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
+ * fact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep(). And on many
+ * architectures __copy_from_user_inatomic() is just defined to
+ * __copy_from_user() so it makes no difference at all on those architectures.
+ */
+static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
+ unsigned nr_pages, unsigned ofs, const struct iovec **iov,
+ size_t *iov_ofs, size_t bytes)
+{
+ struct page **last_page = pages + nr_pages;
+ char *kaddr;
+ size_t copied, len, total = 0;
+
+ do {
+ len = PAGE_CACHE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ kaddr = kmap_atomic(*pages, KM_USER0);
+ copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+ *iov, *iov_ofs, len);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (unlikely(copied != len)) {
+ /* Do it the slow way. */
+ kaddr = kmap(*pages);
+ copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+ *iov, *iov_ofs, len);
+ kunmap(*pages);
+ if (unlikely(copied != len))
+ goto err_out;
+ }
+ total += len;
+ bytes -= len;
+ if (!bytes)
+ break;
+ ntfs_set_next_iovec(iov, iov_ofs, len);
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err_out:
+ total += copied;
+ /* Zero the rest of the target like __copy_from_user(). */
+ while (++pages < last_page) {
+ bytes -= len;
+ if (!bytes)
+ break;
+ len = PAGE_CACHE_SIZE;
+ if (len > bytes)
+ len = bytes;
+ kaddr = kmap_atomic(*pages, KM_USER0);
+ memset(kaddr, 0, len);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ goto out;
+}
+
+static inline void ntfs_flush_dcache_pages(struct page **pages,
+ unsigned nr_pages)
+{
+ BUG_ON(!nr_pages);
+ do {
+ /*
+ * Warning: Do not do the decrement inside the call because
+ * flush_dcache_page() is a no-op macro on i386 so its
+ * argument would never be evaluated and the decrement would
+ * never happen. Decrementing first also keeps the access
+ * inside the array (pages[nr_pages] is one past the end).
+ */
+ --nr_pages;
+ flush_dcache_page(pages[nr_pages]);
+ } while (nr_pages > 0);
+}
+
+/**
+ * ntfs_commit_pages_after_non_resident_write - commit the received data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * See description of ntfs_commit_pages_after_write(), below.
+ */
+static inline int ntfs_commit_pages_after_non_resident_write(
+ struct page **pages, const unsigned nr_pages,
+ s64 pos, size_t bytes)
+{
+ s64 end, initialized_size;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni;
+ struct buffer_head *bh, *head;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ unsigned long flags;
+ unsigned blocksize, u;
+ int err;
+
+ vi = pages[0]->mapping->host;
+ ni = NTFS_I(vi);
+ blocksize = 1 << vi->i_blkbits;
+ end = pos + bytes;
+ u = 0;
+ do {
+ s64 bh_pos;
+ struct page *page;
+ BOOL partial;
+
+ page = pages[u];
+ bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh = head = page_buffers(page);
+ partial = FALSE;
+ do {
+ s64 bh_end;
+
+ bh_end = bh_pos + blocksize;
+ if (bh_end <= pos || bh_pos >= end) {
+ if (!buffer_uptodate(bh))
+ partial = TRUE;
+ } else {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ }
+ } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+ /*
+ * If all buffers are now uptodate but the page is not, set the
+ * page uptodate.
+ */
+ if (!partial && !PageUptodate(page))
+ SetPageUptodate(page);
+ } while (++u < nr_pages);
+ /*
+ * Finally, if we do not need to update initialized_size or i_size we
+ * are finished.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end <= initialized_size) {
+ ntfs_debug("Done.");
+ return 0;
+ }
+ /*
+ * Update initialized_size/i_size as appropriate, both in the inode and
+ * the mft record.
+ */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Map, pin, and lock the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ BUG_ON(!NInoNonResident(ni));
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ write_lock_irqsave(&ni->size_lock, flags);
+ BUG_ON(end > ni->allocated_size);
+ ni->initialized_size = end;
+ a->data.non_resident.initialized_size = cpu_to_sle64(end);
+ if (end > i_size_read(vi)) {
+ i_size_write(vi, end);
+ a->data.non_resident.data_size =
+ a->data.non_resident.initialized_size;
+ }
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done.");
+ return 0;
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
+ "code %i).", err);
+ if (err != -ENOMEM) {
+ NVolSetErrors(ni->vol);
+ make_bad_inode(VFS_I(base_ni));
+ make_bad_inode(vi);
+ }
+ return err;
+}
+
+/**
+ * ntfs_commit_pages_after_write - commit the received data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * This is called from ntfs_file_buffered_write() with i_sem held on the inode
+ * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
+ * locked but not kmap()ped. The source data has already been copied into the
+ * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
+ * the data was copied (for non-resident attributes only) and it returned
+ * success.
+ *
+ * Need to set uptodate and mark dirty all buffers within the boundary of the
+ * write. If all buffers in a page are uptodate we set the page uptodate, too.
+ *
+ * Setting the buffers dirty ensures that they get written out later when
+ * ntfs_writepage() is invoked by the VM.
+ *
+ * Finally, we need to update i_size and initialized_size as appropriate both
+ * in the inode and the mft record.
+ *
+ * This is modelled after fs/buffer.c::generic_commit_write(), which marks
+ * buffers uptodate and dirty, sets the page uptodate if all buffers in the
+ * page are uptodate, and updates i_size if the end of io is beyond i_size. In
+ * that case, it also marks the inode dirty.
+ *
+ * If things have gone as outlined in
+ * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
+ * content modifications here for non-resident attributes. For resident
+ * attributes we need to do the uptodate bringing here which we combine with
+ * the copying into the mft record which means we save one atomic kmap.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_commit_pages_after_write(struct page **pages,
+ const unsigned nr_pages, s64 pos, size_t bytes)
+{
+ s64 end, initialized_size;
+ loff_t i_size;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni;
+ struct page *page;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ char *kattr, *kaddr;
+ unsigned long flags;
+ u32 attr_len;
+ int err;
+
+ BUG_ON(!nr_pages);
+ BUG_ON(!pages);
+ page = pages[0];
+ BUG_ON(!page);
+ vi = page->mapping->host;
+ ni = NTFS_I(vi);
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+ "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+ vi->i_ino, ni->type, page->index, nr_pages,
+ (long long)pos, bytes);
+ if (NInoNonResident(ni))
+ return ntfs_commit_pages_after_non_resident_write(pages,
+ nr_pages, pos, bytes);
+ BUG_ON(nr_pages > 1);
+ /*
+ * Attribute is resident, implying it is not compressed, encrypted, or
+ * sparse.
+ */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ BUG_ON(NInoNonResident(ni));
+ /* Map, pin, and lock the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ a = ctx->attr;
+ BUG_ON(a->non_resident);
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ i_size = i_size_read(vi);
+ BUG_ON(attr_len != i_size);
+ BUG_ON(pos > attr_len);
+ end = pos + bytes;
+ BUG_ON(end > le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset));
+ kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+ kaddr = kmap_atomic(page, KM_USER0);
+ /* Copy the received data from the page to the mft record. */
+ memcpy(kattr + pos, kaddr + pos, bytes);
+ /* Update the attribute length if necessary. */
+ if (end > attr_len) {
+ attr_len = end;
+ a->data.resident.value_length = cpu_to_le32(attr_len);
+ }
+ /*
+ * If the page is not uptodate, bring the out of bounds area(s)
+ * uptodate by copying data from the mft record to the page.
+ */
+ if (!PageUptodate(page)) {
+ if (pos > 0)
+ memcpy(kaddr, kattr, pos);
+ if (end < attr_len)
+ memcpy(kaddr + end, kattr + end, attr_len - end);
+ /* Zero the region outside the end of the attribute value. */
+ memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+ kunmap_atomic(kaddr, KM_USER0);
+ /* Update initialized_size/i_size if necessary. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ BUG_ON(end > ni->allocated_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ BUG_ON(initialized_size != i_size);
+ if (end > initialized_size) {
+ unsigned long flags;
+
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = end;
+ i_size_write(vi, end);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ }
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done.");
+ return 0;
+err_out:
+ if (err == -ENOMEM) {
+ ntfs_warning(vi->i_sb, "Error allocating memory required to "
+ "commit the write.");
+ if (PageUptodate(page)) {
+ ntfs_warning(vi->i_sb, "Page is uptodate, setting "
+ "dirty so the write will be retried "
+ "later on by the VM.");
+ /*
+ * Put the page on mapping->dirty_pages, but leave its
+ * buffers' dirty state as-is.
+ */
+ __set_page_dirty_nobuffers(page);
+ err = 0;
+ } else
+ ntfs_error(vi->i_sb, "Page is not uptodate. Written "
+ "data has been lost.");
+ } else {
+ ntfs_error(vi->i_sb, "Resident attribute commit write failed "
+ "with error %i.", err);
+ NVolSetErrors(ni->vol);
+ make_bad_inode(VFS_I(base_ni));
+ make_bad_inode(vi);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ return err;
+}
+
+/**
+ * ntfs_file_buffered_write - write data to an ntfs file via the page cache
+ *
+ * Locking: The vfs is holding ->i_sem on the inode.
+ */
+static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs,
+ loff_t pos, loff_t *ppos, size_t count)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *vi = mapping->host;
+ ntfs_inode *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
+ struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
+ struct page *cached_page = NULL;
+ char __user *buf = NULL;
+ s64 end, ll;
+ VCN last_vcn;
+ LCN lcn;
+ unsigned long flags;
+ size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
+ ssize_t status, written;
+ unsigned nr_pages;
+ int err;
+ struct pagevec lru_pvec;
+
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+ "pos 0x%llx, count 0x%lx.",
+ vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)pos, (unsigned long)count);
+ if (unlikely(!count))
+ return 0;
+ BUG_ON(NInoMstProtected(ni));
+ /*
+ * If the attribute is not an index root and it is encrypted or
+ * compressed, we cannot write to it yet. Note we need to check for
+ * AT_INDEX_ALLOCATION since this is the type of both directory and
+ * index inodes.
+ */
+ if (ni->type != AT_INDEX_ALLOCATION) {
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ /*
+ * Reminder for later: Encrypted files are _always_
+ * non-resident so that the content can always be
+ * encrypted.
+ */
+ ntfs_debug("Denying write access to encrypted file.");
+ return -EACCES;
+ }
+ if (NInoCompressed(ni)) {
+ /* Only unnamed $DATA attribute can be compressed. */
+ BUG_ON(ni->type != AT_DATA);
+ BUG_ON(ni->name_len);
+ /*
+ * Reminder for later: If resident, the data is not
+ * actually compressed. Only on the switch to non-
+ * resident does compression kick in. This is in
+ * contrast to encrypted files (see above).
+ */
+ ntfs_error(vi->i_sb, "Writing to compressed files is "
+ "not implemented yet. Sorry.");
+ return -EOPNOTSUPP;
+ }
+ }
+ /*
+ * If a previous ntfs_truncate() failed, repeat it and abort if it
+ * fails again.
+ */
+ if (unlikely(NInoTruncateFailed(ni))) {
+ down_write(&vi->i_alloc_sem);
+ err = ntfs_truncate(vi);
+ up_write(&vi->i_alloc_sem);
+ if (err || NInoTruncateFailed(ni)) {
+ if (!err)
+ err = -EIO;
+ ntfs_error(vol->sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "ntfs_truncate() failed (error code "
+ "%i).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ return err;
+ }
+ }
+ /* The first byte after the write. */
+ end = pos + count;
+ /*
+ * If the write goes beyond the allocated size, extend the allocation
+ * to cover the whole of the write, rounded up to the nearest cluster.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end > ll) {
+ /* Extend the allocation without changing the data size. */
+ ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+ if (likely(ll >= 0)) {
+ BUG_ON(pos >= ll);
+ /* If the extension was partial truncate the write. */
+ if (end > ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "the allocation was only "
+ "partially extended.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ end = ll;
+ count = ll - pos;
+ }
+ } else {
+ err = ll;
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Perform a partial write if possible or fail. */
+ if (pos < ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "extending the allocation "
+ "failed (error code %i).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type), err);
+ end = ll;
+ count = ll - pos;
+ } else {
+ ntfs_error(vol->sb, "Cannot perform write to "
+ "inode 0x%lx, attribute type "
+ "0x%x, because extending the "
+ "allocation failed (error "
+ "code %i).", vi->i_ino,
+ (unsigned)
+ le32_to_cpu(ni->type), err);
+ return err;
+ }
+ }
+ }
+ pagevec_init(&lru_pvec, 0);
+ written = 0;
+ /*
+ * If the write starts beyond the initialized size, extend it up to the
+ * beginning of the write and initialize all non-sparse space between
+ * the old initialized size and the new one. This automatically also
+ * increments the vfs inode->i_size to keep it above or equal to the
+ * initialized_size.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (pos > ll) {
+ err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
+ &lru_pvec);
+ if (err < 0) {
+ ntfs_error(vol->sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "extending the initialized size "
+ "failed (error code %i).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ status = err;
+ goto err_out;
+ }
+ }
+ /*
+ * Determine the number of pages per cluster for non-resident
+ * attributes.
+ */
+ nr_pages = 1;
+ if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
+ nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
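+ /*
+ * E.g. (assumed geometry) 64 KiB clusters with 4 KiB pages give
+ * nr_pages == 16, so a write into a hole locks down all sixteen
+ * pages of the cluster at once (see NTFS_MAX_PAGES_PER_CLUSTER).
+ */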
+ /* Finally, perform the actual write. */
+ last_vcn = -1;
+ if (likely(nr_segs == 1))
+ buf = iov->iov_base;
+ do {
+ VCN vcn;
+ pgoff_t idx, start_idx;
+ unsigned ofs, do_pages, u;
+ size_t copied;
+
+ start_idx = idx = pos >> PAGE_CACHE_SHIFT;
+ ofs = pos & ~PAGE_CACHE_MASK;
+ bytes = PAGE_CACHE_SIZE - ofs;
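+ /*
+ * E.g. with 4 KiB pages, pos == 6144 gives idx == 1, ofs ==
+ * 2048 and bytes == 2048, i.e. this iteration covers the
+ * second half of page 1 (further capped to @count below).
+ */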
+ do_pages = 1;
+ if (nr_pages > 1) {
+ vcn = pos >> vol->cluster_size_bits;
+ if (vcn != last_vcn) {
+ last_vcn = vcn;
+ /*
+ * Get the lcn of the vcn the write is in. If
+ * it is a hole, need to lock down all pages in
+ * the cluster.
+ */
+ down_read(&ni->runlist.lock);
+ lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
+ vol->cluster_size_bits, FALSE);
+ up_read(&ni->runlist.lock);
+ if (unlikely(lcn < LCN_HOLE)) {
+ status = -EIO;
+ if (lcn == LCN_ENOMEM)
+ status = -ENOMEM;
+ else
+ ntfs_error(vol->sb, "Cannot "
+ "perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because the attribute "
+ "is corrupt.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ break;
+ }
+ if (lcn == LCN_HOLE) {
+ start_idx = (pos & ~(s64)
+ vol->cluster_size_mask)
+ >> PAGE_CACHE_SHIFT;
+ bytes = vol->cluster_size - (pos &
+ vol->cluster_size_mask);
+ do_pages = nr_pages;
+ }
+ }
+ }
+ if (bytes > count)
+ bytes = count;
+ /*
+ * Bring in the user page(s) that we will copy from _first_.
+ * Otherwise there is a nasty deadlock on copying from the same
+ * page(s) as we are writing to, without it/them being marked
+ * up-to-date. Note, at present there is nothing to stop the
+ * pages being swapped out between us bringing them into memory
+ * and doing the actual copying.
+ */
+ if (likely(nr_segs == 1))
+ ntfs_fault_in_pages_readable(buf, bytes);
+ else
+ ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+ /* Get and lock @do_pages starting at index @start_idx. */
+ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
+ pages, &cached_page, &lru_pvec);
+ if (unlikely(status))
+ break;
+ /*
+ * For non-resident attributes, we need to fill any holes with
+ * actual clusters and ensure all buffers are mapped. We also
+ * need to bring uptodate any buffers that are only partially
+ * being written to.
+ */
+ if (NInoNonResident(ni)) {
+ status = ntfs_prepare_pages_for_non_resident_write(
+ pages, do_pages, pos, bytes);
+ if (unlikely(status)) {
+ loff_t i_size;
+
+ do {
+ unlock_page(pages[--do_pages]);
+ page_cache_release(pages[do_pages]);
+ } while (do_pages);
+ /*
+ * The write preparation may have instantiated
+ * allocated space outside i_size. Trim this
+ * off again. We can ignore any errors in this
+ * case as we will just be wasting a bit of
+ * allocated space, which is not a disaster.
+ */
+ i_size = i_size_read(vi);
+ if (pos + bytes > i_size)
+ vmtruncate(vi, i_size);
+ break;
+ }
+ }
+ u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+ if (likely(nr_segs == 1)) {
+ copied = ntfs_copy_from_user(pages + u, do_pages - u,
+ ofs, buf, bytes);
+ buf += copied;
+ } else
+ copied = ntfs_copy_from_user_iovec(pages + u,
+ do_pages - u, ofs, &iov, &iov_ofs,
+ bytes);
+ ntfs_flush_dcache_pages(pages + u, do_pages - u);
+ status = ntfs_commit_pages_after_write(pages, do_pages, pos,
+ bytes);
+ if (likely(!status)) {
+ written += copied;
+ count -= copied;
+ pos += copied;
+ if (unlikely(copied != bytes))
+ status = -EFAULT;
+ }
+ do {
+ unlock_page(pages[--do_pages]);
+ mark_page_accessed(pages[do_pages]);
+ page_cache_release(pages[do_pages]);
+ } while (do_pages);
+ if (unlikely(status))
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (count);
+err_out:
+ *ppos = pos;
+ if (cached_page)
+ page_cache_release(cached_page);
+ /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
+ if (likely(!status)) {
+ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
+ if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
+ status = generic_osync_inode(vi, mapping,
+ OSYNC_METADATA|OSYNC_DATA);
+ }
+ }
+ pagevec_lru_add(&lru_pvec);
+ ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
+ written ? "written" : "status", (unsigned long)written,
+ (long)status);
+ return written ? written : status;
+}
+
+/**
+ * ntfs_file_aio_write_nolock - write to a file; callers must hold ->i_sem
+ */
+static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos;
+ unsigned long seg;
+ size_t count; /* after file limit checks */
+ ssize_t written, err;
+
+ count = 0;
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ count += iv->iov_len;
+ if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+ continue;
+ if (!seg)
+ return -EFAULT;
+ nr_segs = seg;
+ count -= iv->iov_len; /* This segment is no good */
+ break;
+ }
+ pos = *ppos;
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ /* We can write back this queue in page reclaim. */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+ if (!count)
+ goto out;
+ err = remove_suid(file->f_dentry);
+ if (err)
+ goto out;
+ inode_update_time(inode, 1);
+ written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
+ count);
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+
+/**
+ * ntfs_file_aio_write - write to a file, taking ->i_sem and syncing for O_SYNC
+ */
+static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
+ size_t count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ struct iovec local_iov = { .iov_base = (void __user *)buf,
+ .iov_len = count };
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ down(&inode->i_sem);
+ ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+ up(&inode->i_sem);
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err = sync_page_range(inode, mapping, pos, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
+/**
+ * ntfs_file_writev - vectored write to a file
+ *
+ * Basically the same as generic_file_writev() except that it ends up calling
+ * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
+ */
+static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ down(&inode->i_sem);
+ init_sync_kiocb(&kiocb, file);
+ ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+ up(&inode->i_sem);
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err = sync_page_range(inode, mapping, *ppos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
+/**
+ * ntfs_file_write - simple wrapper for ntfs_file_writev()
+ */
+static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec local_iov = { .iov_base = (void __user *)buf,
+ .iov_len = count };
+
+ return ntfs_file_writev(file, &local_iov, 1, ppos);
+}
+
+/**
* ntfs_file_fsync - sync a file to disk
* @filp: file to be synced
* @dentry: dentry describing the file to sync
@@ -113,39 +2304,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
#endif /* NTFS_RW */
struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek, /* Seek inside file. */
- .read = generic_file_read, /* Read from file. */
- .aio_read = generic_file_aio_read, /* Async read from file. */
- .readv = generic_file_readv, /* Read from file. */
+ .llseek = generic_file_llseek, /* Seek inside file. */
+ .read = generic_file_read, /* Read from file. */
+ .aio_read = generic_file_aio_read, /* Async read from file. */
+ .readv = generic_file_readv, /* Read from file. */
#ifdef NTFS_RW
- .write = generic_file_write, /* Write to file. */
- .aio_write = generic_file_aio_write, /* Async write to file. */
- .writev = generic_file_writev, /* Write to file. */
- /*.release = ,*/ /* Last file is closed. See
- fs/ext2/file.c::
- ext2_release_file() for
- how to use this to discard
- preallocated space for
- write opened files. */
- .fsync = ntfs_file_fsync, /* Sync a file to disk. */
- /*.aio_fsync = ,*/ /* Sync all outstanding async
- i/o operations on a
- kiocb. */
+ .write = ntfs_file_write, /* Write to file. */
+ .aio_write = ntfs_file_aio_write, /* Async write to file. */
+ .writev = ntfs_file_writev, /* Write to file. */
+ /*.release = ,*/ /* Last file is closed. See
+ fs/ext2/file.c::
+ ext2_release_file() for
+ how to use this to discard
+ preallocated space for
+ write opened files. */
+ .fsync = ntfs_file_fsync, /* Sync a file to disk. */
+ /*.aio_fsync = ,*/ /* Sync all outstanding async
+ i/o operations on a
+ kiocb. */
#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .mmap = generic_file_mmap, /* Mmap file. */
- .open = ntfs_file_open, /* Open file. */
- .sendfile = generic_file_sendfile, /* Zero-copy data send with
- the data source being on
- the ntfs partition. We
- do not need to care about
- the data destination. */
- /*.sendpage = ,*/ /* Zero-copy data send with
- the data destination being
- on the ntfs partition. We
- do not need to care about
- the data source. */
+ /*.ioctl = ,*/ /* Perform function on the
+ mounted filesystem. */
+ .mmap = generic_file_mmap, /* Mmap file. */
+ .open = ntfs_file_open, /* Open file. */
+ .sendfile = generic_file_sendfile, /* Zero-copy data send with
+ the data source being on
+ the ntfs partition. We do
+ not need to care about the
+ data destination. */
+ /*.sendpage = ,*/ /* Zero-copy data send with
+ the data destination being
+ on the ntfs partition. We
+ do not need to care about
+ the data source. */
};
struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 7ec04513180..b24f4c4b2c5 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,6 +30,7 @@
#include "debug.h"
#include "inode.h"
#include "attrib.h"
+#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "time.h"
@@ -2291,11 +2292,16 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
#ifdef NTFS_RW
+static const char *es = " Leaving inconsistent metadata. Unmount and run "
+ "chkdsk.";
+
/**
* ntfs_truncate - called when the i_size of an ntfs inode is changed
* @vi: inode for which the i_size was changed
*
- * We do not support i_size changes yet.
+ * We only support i_size changes for normal files at present, i.e. not
+ * compressed and not encrypted. This is enforced in ntfs_setattr(), see
+ * below.
*
* The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
* that the change is allowed.
@@ -2306,80 +2312,499 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
* Returns 0 on success or -errno on error.
*
* Called with ->i_sem held. In all but one case ->i_alloc_sem is held for
- * writing. The only case where ->i_alloc_sem is not held is
+ * writing. The only case in the kernel where ->i_alloc_sem is not held is
* mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
- * with the current i_size as the offset which means that it is a noop as far
- * as ntfs_truncate() is concerned.
+ * with the current i_size as the offset. The analogous place in NTFS is in
+ * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
+ * without holding ->i_alloc_sem.
*/
int ntfs_truncate(struct inode *vi)
{
- ntfs_inode *ni = NTFS_I(vi);
+ s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
+ VCN highest_vcn;
+ unsigned long flags;
+ ntfs_inode *base_ni, *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
ntfs_attr_search_ctx *ctx;
MFT_RECORD *m;
ATTR_RECORD *a;
const char *te = " Leaving file length out of sync with i_size.";
- int err;
+ int err, mp_size, size_change, alloc_change;
+ u32 attr_len;
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
BUG_ON(NInoAttr(ni));
+ BUG_ON(S_ISDIR(vi->i_mode));
+ BUG_ON(NInoMstProtected(ni));
BUG_ON(ni->nr_extents < 0);
- m = map_mft_record(ni);
+retry_truncate:
+ /*
+ * Lock the runlist for writing and map the mft record to ensure it is
+ * safe to mess with the attribute runlist and sizes.
+ */
+ down_write(&ni->runlist.lock);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ m = map_mft_record(base_ni);
if (IS_ERR(m)) {
err = PTR_ERR(m);
ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
"(error code %d).%s", vi->i_ino, err, te);
ctx = NULL;
m = NULL;
- goto err_out;
+ goto old_bad_out;
}
- ctx = ntfs_attr_get_search_ctx(ni, m);
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
if (unlikely(!ctx)) {
ntfs_error(vi->i_sb, "Failed to allocate a search context for "
"inode 0x%lx (not enough memory).%s",
vi->i_ino, te);
err = -ENOMEM;
- goto err_out;
+ goto old_bad_out;
}
err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
CASE_SENSITIVE, 0, NULL, 0, ctx);
if (unlikely(err)) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
ntfs_error(vi->i_sb, "Open attribute is missing from "
"mft record. Inode 0x%lx is corrupt. "
- "Run chkdsk.", vi->i_ino);
- else
+ "Run chkdsk.%s", vi->i_ino, te);
+ err = -EIO;
+ } else
ntfs_error(vi->i_sb, "Failed to lookup attribute in "
- "inode 0x%lx (error code %d).",
- vi->i_ino, err);
- goto err_out;
+ "inode 0x%lx (error code %d).%s",
+ vi->i_ino, err, te);
+ goto old_bad_out;
}
+ m = ctx->mrec;
a = ctx->attr;
- /* If the size has not changed there is nothing to do. */
- if (ntfs_attr_size(a) == i_size_read(vi))
- goto done;
- // TODO: Implement the truncate...
- ntfs_error(vi->i_sb, "Inode size has changed but this is not "
- "implemented yet. Resetting inode size to old value. "
- " This is most likely a bug in the ntfs driver!");
- i_size_write(vi, ntfs_attr_size(a));
-done:
+ /*
+ * The i_size of the vfs inode is the new size for the attribute value.
+ */
+ new_size = i_size_read(vi);
+ /* The current size of the attribute value is the old size. */
+ old_size = ntfs_attr_size(a);
+ /* Calculate the new allocated size. */
+ if (NInoNonResident(ni))
+ new_alloc_size = (new_size + vol->cluster_size - 1) &
+ ~(s64)vol->cluster_size_mask;
+ else
+ new_alloc_size = (new_size + 7) & ~7;
+ /* The current allocated size is the old allocated size. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ old_alloc_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * The change in the file size. This will be 0 if no change, >0 if the
+ * size is growing, and <0 if the size is shrinking.
+ */
+ size_change = -1;
+ if (new_size - old_size >= 0) {
+ size_change = 1;
+ if (new_size == old_size)
+ size_change = 0;
+ }
+ /* As above for the allocated size. */
+ alloc_change = -1;
+ if (new_alloc_size - old_alloc_size >= 0) {
+ alloc_change = 1;
+ if (new_alloc_size == old_alloc_size)
+ alloc_change = 0;
+ }
+ /*
+ * If neither the size nor the allocation are being changed there is
+ * nothing to do.
+ */
+ if (!size_change && !alloc_change)
+ goto unm_done;
+	/* If the size is changing, check if the new size is allowed in $AttrDef. */
+ if (size_change) {
+ err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+ if (unlikely(err)) {
+ if (err == -ERANGE) {
+ ntfs_error(vol->sb, "Truncate would cause the "
+ "inode 0x%lx to %simum size "
+ "for its attribute type "
+ "(0x%x). Aborting truncate.",
+ vi->i_ino,
+ new_size > old_size ? "exceed "
+ "the max" : "go under the min",
+ le32_to_cpu(ni->type));
+ err = -EFBIG;
+ } else {
+ ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+ "attribute type 0x%x. "
+ "Aborting truncate.",
+ vi->i_ino,
+ le32_to_cpu(ni->type));
+ err = -EIO;
+ }
+ /* Reset the vfs inode size to the old size. */
+ i_size_write(vi, old_size);
+ goto err_out;
+ }
+ }
+ if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+ ntfs_warning(vi->i_sb, "Changes in inode size are not "
+ "supported yet for %s files, ignoring.",
+ NInoCompressed(ni) ? "compressed" :
+ "encrypted");
+ err = -EOPNOTSUPP;
+ goto bad_out;
+ }
+ if (a->non_resident)
+ goto do_non_resident_truncate;
+ BUG_ON(NInoNonResident(ni));
+ /* Resize the attribute record to best fit the new attribute size. */
+ if (new_size < vol->mft_record_size &&
+ !ntfs_resident_attr_value_resize(m, a, new_size)) {
+ unsigned long flags;
+
+ /* The resize succeeded! */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ write_lock_irqsave(&ni->size_lock, flags);
+ /* Update the sizes in the ntfs inode and all is done. */
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset);
+ /*
+ * Note ntfs_resident_attr_value_resize() has already done any
+ * necessary data clearing in the attribute record. When the
+ * file is being shrunk vmtruncate() will already have cleared
+ * the top part of the last partial page, i.e. since this is
+ * the resident case this is the page with index 0. However,
+ * when the file is being expanded, the page cache page data
+ * between the old data_size, i.e. old_size, and the new_size
+	 * has not been zeroed. Fortunately, we do not need to zero it
+	 * either: on the one hand it will already be zero because both
+	 * readpage and writepage clear partial page data beyond i_size,
+	 * in which case there is nothing to do; on the other hand, if
+	 * the file is mmap()ped at the same time, POSIX specifies that
+	 * the behaviour is unspecified, thus we do not have to do
+	 * anything. This means that in our implementation, in the rare
+	 * case that the file is mmap()ped and a write occurred into the
+	 * mmap()ped region just beyond the file size
+ * and writepage has not yet been called to write out the page
+ * (which would clear the area beyond the file size) and we now
+ * extend the file size to incorporate this dirty region
+ * outside the file size, a write of the page would result in
+ * this data being written to disk instead of being cleared.
+ * Given both POSIX and the Linux mmap(2) man page specify that
+ * this corner case is undefined, we choose to leave it like
+ * that as this is much simpler for us as we cannot lock the
+ * relevant page now since we are holding too many ntfs locks
+ * which would result in a lock reversal deadlock.
+ */
+ ni->initialized_size = new_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ goto unm_done;
+ }
+ /* If the above resize failed, this must be an attribute extension. */
+ BUG_ON(size_change < 0);
+ /*
+ * We have to drop all the locks so we can call
+ * ntfs_attr_make_non_resident(). This could be optimised by try-
+ * locking the first page cache page and only if that fails dropping
+ * the locks, locking the page, and redoing all the locking and
+ * lookups. While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path, since it can only ever
+	 * happen once for any given file.
+ */
ntfs_attr_put_search_ctx(ctx);
- unmap_mft_record(ni);
- NInoClearTruncateFailed(ni);
- ntfs_debug("Done.");
- return 0;
-err_out:
- if (err != -ENOMEM) {
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * Not enough space in the mft record, try to make the attribute
+ * non-resident and if successful restart the truncation process.
+ */
+ err = ntfs_attr_make_non_resident(ni, old_size);
+ if (likely(!err))
+ goto retry_truncate;
+ /*
+ * Could not make non-resident. If this is due to this not being
+ * permitted for this attribute type or there not being enough space,
+ * try to make other attributes non-resident. Otherwise fail.
+ */
+ if (unlikely(err != -EPERM && err != -ENOSPC)) {
+ ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
+ "type 0x%x, because the conversion from "
+ "resident to non-resident attribute failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto conv_err_out;
+ }
+ /* TODO: Not implemented from here, abort. */
+ if (err == -ENOSPC)
+ ntfs_error(vol->sb, "Not enough space in the mft record/on "
+ "disk for the non-resident attribute value. "
+ "This case is not implemented yet.");
+ else /* if (err == -EPERM) */
+ ntfs_error(vol->sb, "This attribute type may not be "
+ "non-resident. This case is not implemented "
+ "yet.");
+ err = -EOPNOTSUPP;
+ goto conv_err_out;
+#if 0
+ // TODO: Attempt to make other attributes non-resident.
+ if (!err)
+ goto do_resident_extend;
+ /*
+ * Both the attribute list attribute and the standard information
+ * attribute must remain in the base inode. Thus, if this is one of
+ * these attributes, we have to try to move other attributes out into
+ * extent mft records instead.
+ */
+ if (ni->type == AT_ATTRIBUTE_LIST ||
+ ni->type == AT_STANDARD_INFORMATION) {
+ // TODO: Attempt to move other attributes into extent mft
+ // records.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ goto err_out;
+ }
+ // TODO: Attempt to move this attribute to an extent mft record, but
+ // only if it is not already the only attribute in an mft record in
+ // which case there would be nothing to gain.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ /* There is nothing we can do to make enough space. )-: */
+ goto err_out;
+#endif
+do_non_resident_truncate:
+ BUG_ON(!NInoNonResident(ni));
+ if (alloc_change < 0) {
+ highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ if (highest_vcn > 0 &&
+ old_alloc_size >> vol->cluster_size_bits >
+ highest_vcn + 1) {
+ /*
+ * This attribute has multiple extents. Not yet
+ * supported.
+ */
+ ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
+ "attribute type 0x%x, because the "
+ "attribute is highly fragmented (it "
+ "consists of multiple extents) and "
+ "this case is not implemented yet.",
+ vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type));
+ err = -EOPNOTSUPP;
+ goto bad_out;
+ }
+ }
+ /*
+	 * If the size is shrinking, we need to reduce the initialized_size
+	 * and the data_size before reducing the allocation.
+ */
+ if (size_change < 0) {
+ /*
+ * Make the valid size smaller (i_size is already up-to-date).
+ */
+ write_lock_irqsave(&ni->size_lock, flags);
+ if (new_size < ni->initialized_size) {
+ ni->initialized_size = new_size;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(new_size);
+ }
+ a->data.non_resident.data_size = cpu_to_sle64(new_size);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /* If the allocated size is not changing, we are done. */
+ if (!alloc_change)
+ goto unm_done;
+ /*
+ * If the size is shrinking it makes no sense for the
+ * allocation to be growing.
+ */
+ BUG_ON(alloc_change > 0);
+ } else /* if (size_change >= 0) */ {
+ /*
+ * The file size is growing or staying the same but the
+ * allocation can be shrinking, growing or staying the same.
+ */
+ if (alloc_change > 0) {
+ /*
+ * We need to extend the allocation and possibly update
+ * the data size. If we are updating the data size,
+ * since we are not touching the initialized_size we do
+ * not need to worry about the actual data on disk.
+ * And as far as the page cache is concerned, there
+ * will be no pages beyond the old data size and any
+ * partial region in the last page between the old and
+ * new data size (or the end of the page if the new
+ * data size is outside the page) does not need to be
+ * modified as explained above for the resident
+ * attribute truncate case. To do this, we simply drop
+ * the locks we hold and leave all the work to our
+ * friendly helper ntfs_attr_extend_allocation().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ err = ntfs_attr_extend_allocation(ni, new_size,
+ size_change > 0 ? new_size : -1, -1);
+ /*
+ * ntfs_attr_extend_allocation() will have done error
+ * output already.
+ */
+ goto done;
+ }
+ if (!alloc_change)
+ goto alloc_done;
+ }
+ /* alloc_change < 0 */
+ /* Free the clusters. */
+ nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
+ vol->cluster_size_bits, -1, ctx);
+ m = ctx->mrec;
+ a = ctx->attr;
+ if (unlikely(nr_freed < 0)) {
+ ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
+ "%lli). Unmount and run chkdsk to recover "
+ "the lost cluster(s).", (long long)nr_freed);
NVolSetErrors(vol);
+ nr_freed = 0;
+ }
+ /* Truncate the runlist. */
+ err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
+ new_alloc_size >> vol->cluster_size_bits);
+ /*
+ * If the runlist truncation failed and/or the search context is no
+ * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array, thus we mark the inode bad so that no access to
+ * the freed clusters can happen.
+ */
+ if (unlikely(err || IS_ERR(m))) {
+ ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
+ IS_ERR(m) ?
+ "restore attribute search context" :
+ "truncate attribute runlist",
+ IS_ERR(m) ? PTR_ERR(m) : err, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /* Get the size for the shrunk mapping pairs array for the runlist. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+ "attribute type 0x%x, because determining the "
+ "size for the mapping pairs failed with error "
+ "code %i.%s", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), mp_size, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /*
+ * Shrink the attribute record for the new mapping pairs array. Note,
+ * this cannot fail since we are making the attribute smaller thus by
+ * definition there is enough space to do so.
+ */
+ attr_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ BUG_ON(err);
+ /*
+ * Generate the mapping pairs array directly into the attribute record.
+ */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, ni->runlist.rl, 0, -1, NULL);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+ "attribute type 0x%x, because building the "
+ "mapping pairs failed with error code %i.%s",
+ vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+ err, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /* Update the allocated/compressed size as well as the highest vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+ vol->cluster_size_bits) - 1);
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ if (nr_freed) {
+ ni->itype.compressed.size -= nr_freed <<
+ vol->cluster_size_bits;
+ BUG_ON(ni->itype.compressed.size < 0);
+ a->data.non_resident.compressed_size = cpu_to_sle64(
+ ni->itype.compressed.size);
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ }
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * We have shrunk the allocation. If this is a shrinking truncate we
+ * have already dealt with the initialized_size and the data_size above
+ * and we are done. If the truncate is only changing the allocation
+	 * and not the data_size, we are also done. If this is an extending
+	 * truncate, we need to extend the data_size now, which is ensured
+	 * by the fact that @size_change is positive.
+ */
+alloc_done:
+ /*
+	 * If the size is growing, we need to update it now. If it is shrinking,
+ * we have already updated it above (before the allocation change).
+ */
+ if (size_change > 0)
+ a->data.non_resident.data_size = cpu_to_sle64(new_size);
+ /* Ensure the modified mft record is written out. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+unm_done:
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+done:
+ /* Update the mtime and ctime on the base inode. */
+ inode_update_time(VFS_I(base_ni), 1);
+ if (likely(!err)) {
+ NInoClearTruncateFailed(ni);
+ ntfs_debug("Done.");
+ }
+ return err;
+old_bad_out:
+ old_size = -1;
+bad_out:
+ if (err != -ENOMEM && err != -EOPNOTSUPP) {
make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
}
+ if (err != -EOPNOTSUPP)
+ NInoSetTruncateFailed(ni);
+ else if (old_size >= 0)
+ i_size_write(vi, old_size);
+err_out:
if (ctx)
ntfs_attr_put_search_ctx(ctx);
if (m)
- unmap_mft_record(ni);
- NInoSetTruncateFailed(ni);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+out:
+ ntfs_debug("Failed. Returning error code %i.", err);
return err;
+conv_err_out:
+ if (err != -ENOMEM && err != -EOPNOTSUPP) {
+ make_bad_inode(vi);
+ make_bad_inode(VFS_I(base_ni));
+ NVolSetErrors(vol);
+ }
+ if (err != -EOPNOTSUPP)
+ NInoSetTruncateFailed(ni);
+ else
+ i_size_write(vi, old_size);
+ goto out;
}
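A note on the allocation-size arithmetic near the top of ntfs_truncate(): the new allocated size is the new data size rounded up to the allocation granularity, i.e. a whole number of clusters for non-resident attributes and the next multiple of eight bytes for resident ones. A small sketch of the mask arithmetic, assuming (as NTFS guarantees) that the granule is a power of two:

/* Round @size up to the next multiple of the power-of-two @granule. */
static inline s64 round_up_pow2(s64 size, s64 granule)
{
	return (size + granule - 1) & ~(granule - 1);
}

/*
 * With 4096-byte clusters: round_up_pow2(4097, 4096) == 8192 and
 * round_up_pow2(4096, 4096) == 4096.  The resident case above is
 * simply round_up_pow2(new_size, 8).
 */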
/**
@@ -2420,8 +2845,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
err = inode_change_ok(vi, attr);
if (err)
- return err;
-
+ goto out;
/* We do not support NTFS ACLs yet. */
if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
@@ -2429,14 +2853,22 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
err = -EOPNOTSUPP;
goto out;
}
-
if (ia_valid & ATTR_SIZE) {
if (attr->ia_size != i_size_read(vi)) {
- ntfs_warning(vi->i_sb, "Changes in inode size are not "
- "supported yet, ignoring.");
- err = -EOPNOTSUPP;
- // TODO: Implement...
- // err = vmtruncate(vi, attr->ia_size);
+ ntfs_inode *ni = NTFS_I(vi);
+ /*
+ * FIXME: For now we do not support resizing of
+ * compressed or encrypted files yet.
+ */
+ if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+ ntfs_warning(vi->i_sb, "Changes in inode size "
+ "are not supported yet for "
+ "%s files, ignoring.",
+ NInoCompressed(ni) ?
+ "compressed" : "encrypted");
+ err = -EOPNOTSUPP;
+ } else
+ err = vmtruncate(vi, attr->ia_size);
if (err || ia_valid == ATTR_SIZE)
goto out;
} else {
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 5c248d404f0..f5678d5d791 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -1021,10 +1021,17 @@ enum {
FILE_NAME_POSIX = 0x00,
/* This is the largest namespace. It is case sensitive and allows all
Unicode characters except for: '\0' and '/'. Beware that in
- WinNT/2k files which eg have the same name except for their case
- will not be distinguished by the standard utilities and thus a "del
- filename" will delete both "filename" and "fileName" without
- warning. */
+	   WinNT/2k/2003 by default files which e.g. have the same name except
+	   for their case will not be distinguished by the standard utilities
+	   and thus a "del filename" will delete both "filename" and "fileName"
+	   without warning. However, if for example Services For Unix (SFU) is
+	   installed and the case sensitive option was enabled at installation
+	   time, then you can create/access/delete such files.
+	   Note that even SFU places restrictions on the filenames beyond the
+	   '\0' and '/' and in particular the following set of characters is
+	   not allowed: '"', '/', '<', '>', '\'. All other characters,
+	   including the ones not allowed in the WIN32 namespace, are allowed.
+	   Tested with SFU 3.5 (this is now free) running on Windows XP. */
FILE_NAME_WIN32 = 0x01,
/* The standard WinNT/2k NTFS long filenames. Case insensitive. All
Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
@@ -2367,7 +2374,9 @@ typedef struct {
* Extended attribute flags (8-bit).
*/
enum {
- NEED_EA = 0x80
+	NEED_EA	= 0x80		/* If set, the file to which the EA belongs
+				   cannot be interpreted without understanding
+				   the associated extended attributes. */
} __attribute__ ((__packed__));
typedef u8 EA_FLAGS;
@@ -2375,20 +2384,20 @@ typedef u8 EA_FLAGS;
/*
* Attribute: Extended attribute (EA) (0xe0).
*
- * NOTE: Always non-resident. (Is this true?)
+ * NOTE: Can be resident or non-resident.
*
* Like the attribute list and the index buffer list, the EA attribute value is
* a sequence of EA_ATTR variable length records.
- *
- * FIXME: It appears weird that the EA name is not unicode. Is it true?
*/
typedef struct {
le32 next_entry_offset; /* Offset to the next EA_ATTR. */
EA_FLAGS flags; /* Flags describing the EA. */
- u8 ea_name_length; /* Length of the name of the EA in bytes. */
+ u8 ea_name_length; /* Length of the name of the EA in bytes
+ excluding the '\0' byte terminator. */
le16 ea_value_length; /* Byte size of the EA's value. */
- u8 ea_name[0]; /* Name of the EA. */
- u8 ea_value[0]; /* The value of the EA. Immediately follows
+ u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
+ Unicode and it is zero terminated. */
+ u8 ea_value[0]; /* The value of the EA. Immediately follows
the name. */
} __attribute__ ((__packed__)) EA_ATTR;
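Because the EA attribute value is a sequence of variable-length EA_ATTR records chained by next_entry_offset, a consumer walks it by advancing that many bytes from the start of each record. A hedged sketch (the bounds checks and the zero-offset termination convention are assumptions for illustration, not taken from the driver):

/* Sketch: iterate over EA_ATTR records in a buffer of @len bytes. */
static void walk_ea_records(const u8 *buf, size_t len)
{
	size_t pos = 0;

	while (pos + sizeof(EA_ATTR) <= len) {
		const EA_ATTR *ea = (const EA_ATTR*)(buf + pos);

		/* ea->ea_name is zero-terminated ASCII; the value
		   follows the name immediately (see above). */
		if (!le32_to_cpu(ea->next_entry_offset))
			break;	/* assumed last-record convention */
		pos += le32_to_cpu(ea->next_entry_offset);
	}
}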
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 5af3bf0b7ee..29cabf93d2d 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -76,6 +76,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
* @count: number of clusters to allocate
* @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
* @zone: zone from which to allocate the clusters
+ * @is_extension: if TRUE, this is an attribute extension
*
* Allocate @count clusters preferably starting at cluster @start_lcn or at the
* current allocator position if @start_lcn is -1, on the mounted ntfs volume
@@ -86,6 +87,13 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
* @start_vcn specifies the vcn of the first allocated cluster. This makes
* merging the resulting runlist with the old runlist easier.
*
+ * If @is_extension is TRUE, the caller is allocating clusters to extend an
+ * attribute and if it is FALSE, the caller is allocating clusters to fill a
+ * hole in an attribute. Practically the difference is that if @is_extension
+ * is TRUE the returned runlist will be terminated with LCN_ENOENT and if
+ * @is_extension is FALSE the runlist will be terminated with
+ * LCN_RL_NOT_MAPPED.
+ *
* You need to check the return value with IS_ERR(). If this is false, the
* function was successful and the return value is a runlist describing the
* allocated cluster(s). If IS_ERR() is true, the function failed and
@@ -137,7 +145,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
*/
runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone)
+ const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+ const BOOL is_extension)
{
LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
@@ -310,7 +319,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
continue;
}
bit = 1 << (lcn & 7);
- ntfs_debug("bit %i.", bit);
+ ntfs_debug("bit 0x%x.", bit);
/* If the bit is already set, go onto the next one. */
if (*byte & bit) {
lcn++;
@@ -729,7 +738,7 @@ out:
/* Add runlist terminator element. */
if (likely(rl)) {
rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
- rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+ rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
rl[rlpos].length = 0;
}
if (likely(page && !IS_ERR(page))) {
@@ -782,6 +791,7 @@ out:
* @ni: ntfs inode whose runlist describes the clusters to free
* @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
* @count: number of clusters to free or -1 for all clusters
+ * @ctx: active attribute search context if present or NULL if not
* @is_rollback: true if this is a rollback operation
*
* Free @count clusters starting at the cluster @start_vcn in the runlist
@@ -791,15 +801,39 @@ out:
* deallocated. Thus, to completely free all clusters in a runlist, use
* @start_vcn = 0 and @count = -1.
*
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when __ntfs_cluster_free() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
+ * perform the necessary mapping and unmapping.
+ *
+ * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning. Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry. However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
+ * you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
* @is_rollback should always be FALSE, it is for internal use to rollback
* errors. You probably want to use ntfs_cluster_free() instead.
*
- * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
- * has to deal with it later.
+ * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
+ * remove from the runlist or mark sparse the freed runs later.
*
* Return the number of deallocated clusters (not counting sparse ones) on
* success and -errno on error.
*
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
* Locking: - The runlist described by @ni must be locked for writing on entry
* and is locked on return. Note the runlist may be modified when
* needed runlist fragments need to be mapped.
@@ -807,9 +841,13 @@ out:
* on return.
* - This function takes the volume lcn bitmap lock for writing and
* modifies the bitmap contents.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
*/
s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
- const BOOL is_rollback)
+ ntfs_attr_search_ctx *ctx, const BOOL is_rollback)
{
s64 delta, to_free, total_freed, real_freed;
ntfs_volume *vol;
@@ -839,7 +877,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
total_freed = real_freed = 0;
- rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, TRUE);
+ rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
if (IS_ERR(rl)) {
if (!is_rollback)
ntfs_error(vol->sb, "Failed to find first runlist "
@@ -893,7 +931,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
/* Attempt to map runlist. */
vcn = rl->vcn;
- rl = ntfs_attr_find_vcn_nolock(ni, vcn, TRUE);
+ rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
if (IS_ERR(rl)) {
err = PTR_ERR(rl);
if (!is_rollback)
@@ -961,7 +999,7 @@ err_out:
* If rollback fails, set the volume errors flag, emit an error
* message, and return the error code.
*/
- delta = __ntfs_cluster_free(ni, start_vcn, total_freed, TRUE);
+ delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, TRUE);
if (delta < 0) {
ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
"inconsistent metadata! Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index a6a8827882e..72cbca7003b 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -27,6 +27,7 @@
#include <linux/fs.h>
+#include "attrib.h"
#include "types.h"
#include "inode.h"
#include "runlist.h"
@@ -41,16 +42,18 @@ typedef enum {
extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
const VCN start_vcn, const s64 count, const LCN start_lcn,
- const NTFS_CLUSTER_ALLOCATION_ZONES zone);
+ const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+ const BOOL is_extension);
extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count, const BOOL is_rollback);
+ s64 count, ntfs_attr_search_ctx *ctx, const BOOL is_rollback);
/**
* ntfs_cluster_free - free clusters on an ntfs volume
* @ni: ntfs inode whose runlist describes the clusters to free
* @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
* @count: number of clusters to free or -1 for all clusters
+ * @ctx: active attribute search context if present or NULL if not
*
* Free @count clusters starting at the cluster @start_vcn in the runlist
* described by the ntfs inode @ni.
@@ -59,12 +62,36 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
* deallocated. Thus, to completely free all clusters in a runlist, use
* @start_vcn = 0 and @count = -1.
*
- * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
- * has to deal with it later.
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
+ * fragments and allows their mapping. If you do not have the mft record
+ * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
+ * the necessary mapping and unmapping.
+ *
+ * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning. Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry. However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
+ * you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
+ * from the runlist or mark sparse the freed runs later.
*
* Return the number of deallocated clusters (not counting sparse ones) on
* success and -errno on error.
*
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
* Locking: - The runlist described by @ni must be locked for writing on entry
* and is locked on return. Note the runlist may be modified when
* needed runlist fragments need to be mapped.
@@ -72,11 +99,15 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
* on return.
* - This function takes the volume lcn bitmap lock for writing and
* modifies the bitmap contents.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
*/
static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
- s64 count)
+ s64 count, ntfs_attr_search_ctx *ctx)
{
- return __ntfs_cluster_free(ni, start_vcn, count, FALSE);
+ return __ntfs_cluster_free(ni, start_vcn, count, ctx, FALSE);
}
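The WARNING above prescribes a specific caller pattern when a search context is passed in: re-fetch any cached record/attribute pointers after the call and check ctx->mrec with IS_ERR() before touching them, exactly as ntfs_truncate() does after its ntfs_cluster_free() call. A sketch of that pattern (the helper name and error handling are placeholders):

/* Sketch of the documented ntfs_cluster_free() caller pattern. */
static int shrink_attr_alloc(ntfs_inode *ni, ntfs_attr_search_ctx *ctx,
		VCN first_free_vcn)
{
	MFT_RECORD *m;
	ATTR_RECORD *a;
	s64 nr_freed;

	nr_freed = ntfs_cluster_free(ni, first_free_vcn, -1, ctx);
	/* The context may have been remapped: re-cache the pointers. */
	m = ctx->mrec;
	a = ctx->attr;
	if (IS_ERR(m))
		return PTR_ERR(m);	/* @ctx is no longer valid */
	if (nr_freed < 0)
		return nr_freed;	/* freeing itself failed */
	/* ...resize @a and rebuild its mapping pairs as in ntfs_truncate()... */
	return 0;
}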
extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 590887b943f..e38e402e410 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -39,8 +39,7 @@
* If there was insufficient memory to complete the request, return NULL.
* Depending on @gfp_mask the allocation may be guaranteed to succeed.
*/
-static inline void *__ntfs_malloc(unsigned long size,
- gfp_t gfp_mask)
+static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
{
if (likely(size <= PAGE_SIZE)) {
BUG_ON(!size);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b011369b595..0c65cbb8c5c 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -49,7 +49,8 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
ntfs_volume *vol = ni->vol;
struct inode *mft_vi = vol->mft_ino;
struct page *page;
- unsigned long index, ofs, end_index;
+ unsigned long index, end_index;
+ unsigned ofs;
BUG_ON(ni->page);
/*
@@ -1308,7 +1309,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
ll = mftbmp_ni->allocated_size;
read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
- (ll - 1) >> vol->cluster_size_bits, TRUE);
+ (ll - 1) >> vol->cluster_size_bits, NULL);
if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
up_write(&mftbmp_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1354,7 +1355,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
up_write(&vol->lcnbmp_lock);
ntfs_unmap_page(page);
/* Allocate a cluster from the DATA_ZONE. */
- rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE);
+ rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
+ TRUE);
if (IS_ERR(rl2)) {
up_write(&mftbmp_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to allocate a cluster for "
@@ -1738,7 +1740,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
ll = mft_ni->allocated_size;
read_unlock_irqrestore(&mft_ni->size_lock, flags);
rl = ntfs_attr_find_vcn_nolock(mft_ni,
- (ll - 1) >> vol->cluster_size_bits, TRUE);
+ (ll - 1) >> vol->cluster_size_bits, NULL);
if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
up_write(&mft_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1779,7 +1781,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
nr > min_nr ? "default" : "minimal", (long long)nr);
old_last_vcn = rl[1].vcn;
do {
- rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE);
+ rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
+ TRUE);
if (likely(!IS_ERR(rl2)))
break;
if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
@@ -1951,20 +1954,21 @@ restore_undo_alloc:
NVolSetErrors(vol);
return ret;
}
- a = ctx->attr;
- a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
+ ctx->attr->data.non_resident.highest_vcn =
+ cpu_to_sle64(old_last_vcn - 1);
undo_alloc:
- if (ntfs_cluster_free(mft_ni, old_last_vcn, -1) < 0) {
+ if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
ntfs_error(vol->sb, "Failed to free clusters from mft data "
"attribute.%s", es);
NVolSetErrors(vol);
}
+ a = ctx->attr;
if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
ntfs_error(vol->sb, "Failed to truncate mft data attribute "
"runlist.%s", es);
NVolSetErrors(vol);
}
- if (mp_rebuilt) {
+ if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
a->data.non_resident.mapping_pairs_offset),
old_alen - le16_to_cpu(
@@ -1981,6 +1985,10 @@ undo_alloc:
}
flush_dcache_mft_record_page(ctx->ntfs_ino);
mark_mft_record_dirty(ctx->ntfs_ino);
+ } else if (IS_ERR(ctx->mrec)) {
+ ntfs_error(vol->sb, "Failed to restore attribute search "
+ "context.%s", es);
+ NVolSetErrors(vol);
}
if (ctx)
ntfs_attr_put_search_ctx(ctx);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 453d0d51ea4..6c16db9e1a8 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1447,7 +1447,7 @@ not_enabled:
if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
"attribute (size is 0x%llx but should be at "
- "least 0x%x bytes).", i_size_read(tmp_ino),
+ "least 0x%zx bytes).", i_size_read(tmp_ino),
sizeof(USN_HEADER));
return FALSE;
}
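The format-string fix above matters because sizeof() yields a size_t, whose width differs between 32-bit and 64-bit builds; %zx is the C99 length modifier for size_t, so the argument and conversion always agree. For example:

/* %zx/%zu are the portable conversions for size_t values. */
printk(KERN_INFO "USN_HEADER is 0x%zx bytes.\n", sizeof(USN_HEADER));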
diff --git a/fs/open.c b/fs/open.c
index f0d90cf0495..8d06ec911fd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -739,7 +739,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
}
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
- int flags, struct file *f)
+ int flags, struct file *f,
+ int (*open)(struct inode *, struct file *))
{
struct inode *inode;
int error;
@@ -761,11 +762,14 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
f->f_op = fops_get(inode->i_fop);
file_move(f, &inode->i_sb->s_files);
- if (f->f_op && f->f_op->open) {
- error = f->f_op->open(inode,f);
+ if (!open && f->f_op)
+ open = f->f_op->open;
+ if (open) {
+ error = open(inode, f);
if (error)
goto cleanup_all;
}
+
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -814,28 +818,75 @@ struct file *filp_open(const char * filename, int flags, int mode)
{
int namei_flags, error;
struct nameidata nd;
- struct file *f;
namei_flags = flags;
if ((namei_flags+1) & O_ACCMODE)
namei_flags++;
- if (namei_flags & O_TRUNC)
- namei_flags |= 2;
-
- error = -ENFILE;
- f = get_empty_filp();
- if (f == NULL)
- return ERR_PTR(error);
error = open_namei(filename, namei_flags, mode, &nd);
if (!error)
- return __dentry_open(nd.dentry, nd.mnt, flags, f);
+ return nameidata_to_filp(&nd, flags);
- put_filp(f);
return ERR_PTR(error);
}
EXPORT_SYMBOL(filp_open);
+/**
+ * lookup_instantiate_filp - instantiates the open intent filp
+ * @nd: pointer to nameidata
+ * @dentry: pointer to dentry
+ * @open: open callback
+ *
+ * Helper for filesystems that want to use lookup open intents and pass back
+ * a fully instantiated struct file to the caller.
+ * This function is meant to be called from within a filesystem's
+ * lookup method.
+ * Note that in case of error, nd->intent.open.file is destroyed, but the
+ * path information remains valid.
+ * If the open callback is set to NULL, then the standard f_op->open()
+ * filesystem callback is substituted.
+ */
+struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
+ int (*open)(struct inode *, struct file *))
+{
+ if (IS_ERR(nd->intent.open.file))
+ goto out;
+ if (IS_ERR(dentry))
+ goto out_err;
+ nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->mnt),
+ nd->intent.open.flags - 1,
+ nd->intent.open.file,
+ open);
+out:
+ return nd->intent.open.file;
+out_err:
+ release_open_intent(nd);
+ nd->intent.open.file = (struct file *)dentry;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
+
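As the kernel-doc above says, lookup_instantiate_filp() is meant to be called from a filesystem's lookup method when an open intent is pending. A hypothetical sketch (the filesystem and its lookup helper are illustrative only):

/* Hypothetical ->lookup() using the open intent; sketch only. */
static struct dentry *examplefs_lookup(struct inode *dir,
		struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *res = examplefs_do_lookup(dir, dentry);

	if (!IS_ERR(res) && nd && (nd->flags & LOOKUP_OPEN))
		/* NULL callback: fall back to the f_op->open() default. */
		(void)lookup_instantiate_filp(nd, dentry, NULL);
	return res;
}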
+/**
+ * nameidata_to_filp - convert a nameidata to an open filp.
+ * @nd: pointer to nameidata
+ * @flags: open flags
+ *
+ * Note that this function destroys the original nameidata
+ */
+struct file *nameidata_to_filp(struct nameidata *nd, int flags)
+{
+ struct file *filp;
+
+ /* Pick up the filp from the open intent */
+ filp = nd->intent.open.file;
+ /* Has the filesystem initialised the file for us? */
+ if (filp->f_dentry == NULL)
+ filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL);
+ else
+ path_release(nd);
+ return filp;
+}
+
struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
int error;
@@ -846,7 +897,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
if (f == NULL)
return ERR_PTR(error);
- return __dentry_open(dentry, mnt, flags, f);
+ return __dentry_open(dentry, mnt, flags, f, NULL);
}
EXPORT_SYMBOL(dentry_open);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 77e178f1316..9c06c5434ec 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -192,6 +192,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
struct part_attribute {
struct attribute attr;
ssize_t (*show)(struct hd_struct *,char *);
+ ssize_t (*store)(struct hd_struct *,const char *, size_t);
};
static ssize_t
@@ -201,14 +202,33 @@ part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
ssize_t ret = 0;
if (part_attr->show)
- ret = part_attr->show(p,page);
+ ret = part_attr->show(p, page);
+ return ret;
+}
+static ssize_t
+part_attr_store(struct kobject * kobj, struct attribute * attr,
+ const char *page, size_t count)
+{
+ struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+ struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
+ ssize_t ret = 0;
+
+ if (part_attr->store)
+ ret = part_attr->store(p, page, count);
return ret;
}
static struct sysfs_ops part_sysfs_ops = {
.show = part_attr_show,
+ .store = part_attr_store,
};
+static ssize_t part_uevent_store(struct hd_struct * p,
+ const char *page, size_t count)
+{
+ kobject_hotplug(&p->kobj, KOBJ_ADD);
+ return count;
+}
static ssize_t part_dev_read(struct hd_struct * p, char *page)
{
struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
@@ -229,6 +249,10 @@ static ssize_t part_stat_read(struct hd_struct * p, char *page)
p->reads, (unsigned long long)p->read_sectors,
p->writes, (unsigned long long)p->write_sectors);
}
+static struct part_attribute part_attr_uevent = {
+ .attr = {.name = "uevent", .mode = S_IWUSR },
+ .store = part_uevent_store
+};
static struct part_attribute part_attr_dev = {
.attr = {.name = "dev", .mode = S_IRUGO },
.show = part_dev_read
@@ -247,6 +271,7 @@ static struct part_attribute part_attr_stat = {
};
static struct attribute * default_attrs[] = {
+ &part_attr_uevent.attr,
&part_attr_dev.attr,
&part_attr_start.attr,
&part_attr_size.attr,
@@ -430,7 +455,7 @@ void del_gendisk(struct gendisk *disk)
disk->flags &= ~GENHD_FL_UP;
unlink_gendisk(disk);
disk_stat_set_all(disk, 0);
- disk->stamp = disk->stamp_idle = 0;
+ disk->stamp = 0;
devfs_remove_disk(disk);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d84eecacbea..3e1239e4b30 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
jiffies_to_clock_t(it_real_value),
start_time,
vsize,
- mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+ mm ? get_mm_rss(mm) : 0,
rsslim,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a8c34461d4..b638fb50074 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -533,7 +533,7 @@ static void proc_kill_inodes(struct proc_dir_entry *de)
*/
file_list_lock();
list_for_each(p, &sb->s_files) {
- struct file * filp = list_entry(p, struct file, f_list);
+ struct file * filp = list_entry(p, struct file, f_u.fu_list);
struct dentry * dentry = filp->f_dentry;
struct inode * inode;
struct file_operations *fops;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index effa6c0c467..e6a818a93f3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -156,10 +156,13 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
WARN_ON(de && de->deleted);
+ if (de != NULL && !try_module_get(de->owner))
+ goto out_mod;
+
inode = iget(sb, ino);
if (!inode)
- goto out_fail;
-
+ goto out_ino;
+
PROC_I(inode)->pde = de;
if (de) {
if (de->mode) {
@@ -171,20 +174,20 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
inode->i_size = de->size;
if (de->nlink)
inode->i_nlink = de->nlink;
- if (!try_module_get(de->owner))
- goto out_fail;
if (de->proc_iops)
inode->i_op = de->proc_iops;
if (de->proc_fops)
inode->i_fop = de->proc_fops;
}
-out:
return inode;
-out_fail:
+out_ino:
+ if (de != NULL)
+ module_put(de->owner);
+out_mod:
de_put(de);
- goto out;
+ return NULL;
}
int proc_fill_super(struct super_block *s, void *data, int silent)
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a3453555a94..5b6b0b6038a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -629,12 +629,4 @@ void __init proc_misc_init(void)
if (entry)
entry->proc_fops = &proc_sysrq_trigger_operations;
#endif
-#ifdef CONFIG_PPC32
- {
- extern struct file_operations ppc_htab_operations;
- entry = create_proc_entry("ppc_htab", S_IRUGO|S_IWUSR, NULL);
- if (entry)
- entry->proc_fops = &ppc_htab_operations;
- }
-#endif
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c7ef3e48e35..d2fa42006d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
char *task_mem(struct mm_struct *mm, char *buffer)
{
unsigned long data, text, lib;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+ /*
+ * Note: to minimize their overhead, mm maintains hiwater_vm and
+ * hiwater_rss only when about to *lower* total_vm or rss. Any
+ * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher. Barriers? Not
+	 * worth the effort; such snapshots can always be inconsistent.
+ */
+ hiwater_vm = total_vm = mm->total_vm;
+ if (hiwater_vm < mm->hiwater_vm)
+ hiwater_vm = mm->hiwater_vm;
+ hiwater_rss = total_rss = get_mm_rss(mm);
+ if (hiwater_rss < mm->hiwater_rss)
+ hiwater_rss = mm->hiwater_rss;
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
buffer += sprintf(buffer,
+ "VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n"
+ "VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n",
- (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+ hiwater_vm << (PAGE_SHIFT-10),
+ (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
- get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+ hiwater_rss << (PAGE_SHIFT-10),
+ total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
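All of the Vm* figures above convert page counts to kilobytes with << (PAGE_SHIFT-10), i.e. a left shift by log2(PAGE_SIZE/1024). With 4 KiB pages PAGE_SHIFT is 12, so the shift is 2 and the conversion is pages * 4:

/* pages -> kB; with 4 KiB pages, (npages << 2) == npages * 4 kB. */
unsigned long kb = npages << (PAGE_SHIFT - 10);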
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm)
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- int rss = get_mm_counter(mm, rss);
-
- *shared = rss - get_mm_counter(mm, anon_rss);
+ *shared = get_mm_counter(mm, file_rss);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = rss;
+ *resident = *shared + get_mm_counter(mm, anon_rss);
return mm->total_vm;
}
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct mem_size_stats *mss)
{
pte_t *pte, ptent;
+ spinlock_t *ptl;
unsigned long pfn;
struct page *page;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
ptent = *pte;
- if (pte_none(ptent) || !pte_present(ptent))
+ if (!pte_present(ptent))
continue;
mss->resident += PAGE_SIZE;
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
mss->private_clean += PAGE_SIZE;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- cond_resched_lock(&vma->vm_mm->page_table_lock);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
}
static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mm_struct *mm = vma->vm_mm;
struct mem_size_stats mss;
memset(&mss, 0, sizeof mss);
-
- if (mm) {
- spin_lock(&mm->page_table_lock);
+ if (vma->vm_mm)
smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
- spin_unlock(&mm->page_table_lock);
- }
-
return show_map_internal(m, v, &mss);
}
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
for_each_node(i)
md->node[i] =0;
- spin_lock(&mm->page_table_lock);
for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
page = follow_page(mm, vaddr, 0);
if (page) {
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
md->anon++;
md->node[page_to_nid(page)]++;
}
+ cond_resched();
}
- spin_unlock(&mm->page_table_lock);
return md;
}
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_printf(m, " interleave={");
first = 1;
for_each_node(n) {
- if (test_bit(n, pol->v.nodes)) {
+ if (node_isset(n, pol->v.nodes)) {
if (!first)
seq_putc(m,',');
else
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 2706e2adffa..45829889dcd 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2022,7 +2022,7 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
}
#ifdef CONFIG_REISERFS_CHECK
-void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s)
+void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s)
{
void *vp;
static size_t malloced;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d76ee6c4f9b..5f82352b97e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2842,7 +2842,7 @@ static int reiserfs_set_page_dirty(struct page *page)
* even in -o notail mode, we can't be sure an old mount without -o notail
* didn't create files with tails.
*/
-static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
+static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
{
struct inode *inode = page->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 44b02fc02eb..42afb5bef11 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1024,12 +1024,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
*mount_options |= 1 << REISERFS_QUOTA;
} else {
- if (REISERFS_SB(s)->s_qf_names[qtype]) {
- kfree(REISERFS_SB(s)->
- s_qf_names[qtype]);
- REISERFS_SB(s)->s_qf_names[qtype] =
- NULL;
- }
+ kfree(REISERFS_SB(s)->s_qf_names[qtype]);
+ REISERFS_SB(s)->s_qf_names[qtype] = NULL;
}
}
if (c == 'f') {
@@ -1158,11 +1154,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (!reiserfs_parse_options
(s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++)
- if (REISERFS_SB(s)->s_qf_names[i]) {
- kfree(REISERFS_SB(s)->s_qf_names[i]);
- REISERFS_SB(s)->s_qf_names[i] = NULL;
- }
+ for (i = 0; i < MAXQUOTAS; i++) {
+ kfree(REISERFS_SB(s)->s_qf_names[i]);
+ REISERFS_SB(s)->s_qf_names[i] = NULL;
+ }
#endif
return -EINVAL;
}
@@ -1940,13 +1935,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
brelse(SB_BUFFER_WITH_SB(s));
#ifdef CONFIG_QUOTA
for (j = 0; j < MAXQUOTAS; j++) {
- if (sbi->s_qf_names[j])
- kfree(sbi->s_qf_names[j]);
+ kfree(sbi->s_qf_names[j]);
+ sbi->s_qf_names[j] = NULL;
}
#endif
- if (sbi != NULL) {
- kfree(sbi);
- }
+ kfree(sbi);
s->s_fs_info = NULL;
return errval;
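The reiserfs cleanups in this hunk and the two above all rely on kfree() accepting a NULL pointer as a no-op, so the "if (ptr)" guards around it are redundant. The resulting idiom:

kfree(sbi->s_qf_names[j]);	/* safe even if never allocated: kfree(NULL) is a no-op */
sbi->s_qf_names[j] = NULL;	/* reset to guard against a later double free */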
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 87ac9dc8b38..72e12079867 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -453,7 +453,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
struct page *page;
/* We can deadlock if we try to free dentries,
and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
- mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS;
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
page = read_cache_page(mapping, n,
(filler_t *) mapping->a_ops->readpage, NULL);
if (!IS_ERR(page)) {
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 6703efa3c43..a47ac9aac8b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -296,8 +296,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- if (value)
- kfree(value);
+ kfree(value);
if (!error) {
/* Release the old one */
diff --git a/fs/super.c b/fs/super.c
index 6e57ee252e1..f60155ec778 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -513,7 +513,7 @@ static void mark_files_ro(struct super_block *sb)
struct file *f;
file_list_lock();
- list_for_each_entry(f, &sb->s_files, f_list) {
+ list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f))
f->f_mode &= ~FMODE_WRITE;
}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 1c6f6b57ef1..ef46939c0c1 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -621,8 +621,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
}
/* build the entry of long file name */
- for (cksum = i = 0; i < 11; i++)
- cksum = (((cksum&1)<<7)|((cksum&0xfe)>>1)) + msdos_name[i];
+ cksum = fat_checksum(msdos_name);
*nr_slots = usize / 13;
for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
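The open-coded loop deleted above computes the standard FAT long-filename checksum over the 11-byte 8.3 short name; fat_checksum() presumably factors out exactly this computation. A sketch reconstructed from the deleted lines (the helper name is local to this sketch):

/* FAT LFN checksum over the 11-byte 8.3 name, as in the deleted loop. */
static u8 fat_lfn_checksum(const u8 *msdos_name)
{
	u8 cksum = 0;
	int i;

	for (i = 0; i < 11; i++)
		/* rotate right one bit, then add the next name byte */
		cksum = (((cksum & 1) << 7) | ((cksum & 0xfe) >> 1))
			+ msdos_name[i];
	return cksum;
}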
@@ -888,10 +887,10 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct buffer_head *dotdot_bh;
struct msdos_dir_entry *dotdot_de;
- loff_t dotdot_i_pos;
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec ts;
+ loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -914,31 +913,24 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
ts = CURRENT_TIME_SEC;
if (new_inode) {
- err = vfat_find(new_dir, &new_dentry->d_name, &sinfo);
- if (err)
- goto out;
- if (MSDOS_I(new_inode)->i_pos != sinfo.i_pos) {
- /* WTF??? Cry and fail. */
- printk(KERN_WARNING "vfat_rename: fs corrupted\n");
- goto out;
- }
-
if (is_dir) {
err = fat_dir_empty(new_inode);
if (err)
goto out;
}
+ new_i_pos = MSDOS_I(new_inode)->i_pos;
fat_detach(new_inode);
} else {
err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
&ts, &sinfo);
if (err)
goto out;
+ new_i_pos = sinfo.i_pos;
}
new_dir->i_version++;
fat_detach(old_inode);
- fat_attach(old_inode, sinfo.i_pos);
+ fat_attach(old_inode, new_i_pos);
if (IS_DIRSYNC(new_dir)) {
err = fat_sync_inode(old_inode);
if (err)
@@ -1002,7 +994,7 @@ error_inode:
fat_detach(old_inode);
fat_attach(old_inode, old_sinfo.i_pos);
if (new_inode) {
- fat_attach(new_inode, sinfo.i_pos);
+ fat_attach(new_inode, new_i_pos);
if (corrupt)
corrupt |= fat_sync_inode(new_inode);
} else {
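
[note] Two independent fixes in the vfat hunks: the open-coded 8.3 checksum loop is replaced by the fat_checksum() helper, and vfat_rename() now latches the target position in new_i_pos before fat_detach() runs, so both the success path and the error_inode unwind attach to a stable location instead of a possibly stale sinfo.i_pos. The checksum itself is a pure rotate-and-add over the 11-byte directory-entry name, small enough to model exactly; fat_checksum_sketch below mirrors the removed loop (the local name is ours):

    #include <stdio.h>

    /* Rotate the running sum right one bit, then add the next byte of
     * the 11-byte 8.3 name; same expression as the removed loop. */
    static unsigned char fat_checksum_sketch(const unsigned char *name)
    {
        unsigned char s = 0;
        int i;

        for (i = 0; i < 11; i++)
            s = (((s & 1) << 7) | ((s & 0xfe) >> 1)) + name[i];
        return s;
    }

    int main(void)
    {
        /* "FOO.TXT" as stored on disk: name padded to 8 bytes,
         * extension to 3, no dot. */
        const unsigned char name[11] = "FOO     TXT";

        printf("checksum = 0x%02x\n", fat_checksum_sketch(name));
        return 0;
    }
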
diff --git a/fs/xattr.c b/fs/xattr.c
index 3f9c64bea15..f6e00c0e114 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -143,7 +143,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
if (size) {
if (size > XATTR_SIZE_MAX)
size = XATTR_SIZE_MAX;
- kvalue = kmalloc(size, GFP_KERNEL);
+ kvalue = kzalloc(size, GFP_KERNEL);
if (!kvalue)
return -ENOMEM;
}
@@ -154,11 +154,15 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
error = -EOPNOTSUPP;
if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
- else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
- sizeof XATTR_SECURITY_PREFIX - 1)) {
+
+ if (!strncmp(kname, XATTR_SECURITY_PREFIX,
+ sizeof XATTR_SECURITY_PREFIX - 1)) {
const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
- error = security_inode_getsecurity(d->d_inode, suffix, kvalue,
- size);
+ int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue,
+ size, error);
+ /* Security module active: overwrite error value */
+ if (rv != -EOPNOTSUPP)
+ error = rv;
}
if (error > 0) {
if (size && copy_to_user(value, kvalue, error))
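
[note] The getxattr() hunks make two changes: the value buffer is kzalloc()ed, presumably so a handler that over-reports its length cannot leak stale heap contents to userspace, and security.* names are now always offered to the security layer after the filesystem handler, with the LSM result only overriding when some module actually claims the attribute. A self-contained model of that fallback order; everything suffixed _m, the stub handlers, and the errno values are illustrative:

    #include <stdio.h>
    #include <string.h>

    #define EOPNOTSUPP_M 95
    #define ERANGE_M     34

    /* Stub "filesystem" handler: knows no attributes at all. */
    static int fs_getxattr_m(const char *name, char *buf, size_t size)
    {
        (void)name; (void)buf; (void)size;
        return -EOPNOTSUPP_M;
    }

    /* Stub "security module": serves exactly one attribute. */
    static int lsm_getsecurity_m(const char *suffix, char *buf, size_t size)
    {
        const char *ctx = "user_u:object_r:tmp_t";
        size_t len = strlen(ctx);

        if (strcmp(suffix, "selinux") != 0)
            return -EOPNOTSUPP_M;
        if (size < len)
            return -ERANGE_M;
        memcpy(buf, ctx, len);
        return (int)len;
    }

    /* Filesystem handler first; security.* names are then offered to
     * the security layer, whose answer only overrides when claimed. */
    static int getxattr_m(const char *name, char *buf, size_t size)
    {
        int error = fs_getxattr_m(name, buf, size);

        if (!strncmp(name, "security.", strlen("security."))) {
            int rv = lsm_getsecurity_m(name + strlen("security."),
                                       buf, size);
            if (rv != -EOPNOTSUPP_M)
                error = rv;
        }
        return error;
    }

    int main(void)
    {
        char buf[64];
        int n = getxattr_m("security.selinux", buf, sizeof(buf));

        if (n > 0)
            printf("%.*s (%d bytes)\n", n, buf, n);
        return 0;
    }
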
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d2653b589b1..3c92162dc72 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,11 +45,11 @@
void *
-kmem_alloc(size_t size, gfp_t flags)
+kmem_alloc(size_t size, unsigned int __nocast flags)
{
- int retries = 0;
- unsigned int lflags = kmem_flags_convert(flags);
- void *ptr;
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
do {
if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, gfp_t flags)
}
void *
-kmem_zalloc(size_t size, gfp_t flags)
+kmem_zalloc(size_t size, unsigned int __nocast flags)
{
void *ptr;
@@ -90,7 +90,7 @@ kmem_free(void *ptr, size_t size)
void *
kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
- gfp_t flags)
+ unsigned int __nocast flags)
{
void *new;
@@ -105,11 +105,11 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
}
void *
-kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
{
- int retries = 0;
- unsigned int lflags = kmem_flags_convert(flags);
- void *ptr;
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
do {
ptr = kmem_cache_alloc(zone, lflags);
@@ -124,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
}
void *
-kmem_zone_zalloc(kmem_zone_t *zone, gfp_t flags)
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
{
void *ptr;
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index ee7010f085b..f4bb78c268c 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -81,9 +81,9 @@ typedef unsigned long xfs_pflags_t;
*(NSTATEP) = *(OSTATEP); \
} while (0)
-static __inline unsigned int kmem_flags_convert(gfp_t flags)
+static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags)
{
- unsigned int lflags = __GFP_NOWARN; /* we'll report problems, if need be */
+ gfp_t lflags = __GFP_NOWARN; /* we'll report problems, if need be */
#ifdef DEBUG
if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
@@ -125,16 +125,16 @@ kmem_zone_destroy(kmem_zone_t *zone)
BUG();
}
-extern void *kmem_zone_zalloc(kmem_zone_t *, gfp_t);
-extern void *kmem_zone_alloc(kmem_zone_t *, gfp_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_alloc(size_t, gfp_t);
-extern void *kmem_realloc(void *, size_t, size_t, gfp_t);
-extern void *kmem_zalloc(size_t, gfp_t);
+extern void *kmem_alloc(size_t, unsigned int __nocast);
+extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_zalloc(size_t, unsigned int __nocast);
extern void kmem_free(void *, size_t);
typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, unsigned int);
+typedef int (*kmem_shake_func_t)(int, gfp_t);
static __inline kmem_shaker_t
kmem_shake_register(kmem_shake_func_t sfunc)
@@ -149,7 +149,7 @@ kmem_shake_deregister(kmem_shaker_t shrinker)
}
static __inline int
-kmem_shake_allow(unsigned int gfp_mask)
+kmem_shake_allow(gfp_t gfp_mask)
{
return (gfp_mask & __GFP_WAIT);
}
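
[note] The kmem.c and kmem.h hunks un-confuse two flag namespaces: XFS's KM_SLEEP/KM_NOSLEEP/KM_NOFS/KM_MAYFAIL flags are not gfp flags, so the allocation wrappers now take them as unsigned int __nocast, and kmem_flags_convert() is the one place where they become a real gfp_t. A sketch of that conversion shape; the constants and the exact mapping are illustrative guesses, only the direction of the conversion is taken from the diff:

    #include <assert.h>

    /* Illustrative flag values; the kernel's KM_* and GFP_* differ. */
    #define KM_SLEEP_M    0x0001u
    #define KM_NOSLEEP_M  0x0002u
    #define KM_NOFS_M     0x0004u
    #define KM_MAYFAIL_M  0x0008u

    #define GFP_ATOMIC_M  0x01u
    #define GFP_KERNEL_M  0x02u
    #define GFP_NOFS_M    0x04u
    #define GFP_NOWARN_M  0x08u

    typedef unsigned int gfp_model_t;

    /* Takes XFS's private KM_* word (deliberately not a gfp_t, hence
     * __nocast upstream) and returns a genuine gfp_t. */
    static gfp_model_t kmem_flags_convert_m(unsigned int km_flags)
    {
        gfp_model_t lflags = GFP_NOWARN_M; /* callers report failures */

        if (km_flags & KM_NOSLEEP_M)
            lflags |= GFP_ATOMIC_M;
        else if (km_flags & KM_NOFS_M)
            lflags |= GFP_NOFS_M;
        else
            lflags |= GFP_KERNEL_M;
        return lflags;
    }

    int main(void)
    {
        assert(kmem_flags_convert_m(KM_SLEEP_M) & GFP_KERNEL_M);
        assert(kmem_flags_convert_m(KM_NOSLEEP_M) & GFP_ATOMIC_M);
        assert(kmem_flags_convert_m(KM_NOFS_M) & GFP_NOFS_M);
        return 0;
    }
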
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c6c077978fe..7aa39872470 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1296,7 +1296,7 @@ linvfs_invalidate_page(
STATIC int
linvfs_release_page(
struct page *page,
- int gfp_mask)
+ gfp_t gfp_mask)
{
struct inode *inode = page->mapping->host;
int dirty, delalloc, unmapped, unwritten;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e82cf72ac59..4cd46abe843 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -64,7 +64,7 @@
STATIC kmem_cache_t *pagebuf_zone;
STATIC kmem_shaker_t pagebuf_shake;
-STATIC int xfsbufd_wakeup(int, unsigned int);
+STATIC int xfsbufd_wakeup(int, gfp_t);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
STATIC struct workqueue_struct *xfslogd_workqueue;
@@ -181,8 +181,9 @@ set_page_region(
size_t offset,
size_t length)
{
- page->private |= page_region_mask(offset, length);
- if (page->private == ~0UL)
+ set_page_private(page,
+ page_private(page) | page_region_mask(offset, length));
+ if (page_private(page) == ~0UL)
SetPageUptodate(page);
}
@@ -194,7 +195,7 @@ test_page_region(
{
unsigned long mask = page_region_mask(offset, length);
- return (mask && (page->private & mask) == mask);
+ return (mask && (page_private(page) & mask) == mask);
}
/*
@@ -383,7 +384,7 @@ _pagebuf_lookup_pages(
size_t blocksize = bp->pb_target->pbr_bsize;
size_t size = bp->pb_count_desired;
size_t nbytes, offset;
- int gfp_mask = pb_to_gfp(flags);
+ gfp_t gfp_mask = pb_to_gfp(flags);
unsigned short page_count, i;
pgoff_t first;
loff_t end;
@@ -1749,8 +1750,8 @@ STATIC int xfsbufd_force_sleep;
STATIC int
xfsbufd_wakeup(
- int priority,
- unsigned int mask)
+ int priority,
+ gfp_t mask)
{
if (xfsbufd_force_sleep)
return 0;
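
[note] set_page_region() and test_page_region() stop poking page->private directly and go through the page_private()/set_page_private() accessors; in the kernel these are thin wrappers over the same word, and the point of the change is to route every use of the field through one auditable spot. A minimal model of the pattern, with _m suffixed names standing in for the real macros:

    #include <assert.h>

    struct page_model {
        unsigned long private_word;
    };

    #define page_private_m(p)        ((p)->private_word)
    #define set_page_private_m(p, v) ((p)->private_word = (v))

    /* The set_page_region() pattern from the hunk: OR a region mask
     * into the private word; the page counts as fully up to date once
     * every bit is set. */
    static int set_region_and_check(struct page_model *p, unsigned long mask)
    {
        set_page_private_m(p, page_private_m(p) | mask);
        return page_private_m(p) == ~0UL;
    }

    int main(void)
    {
        struct page_model p = { 0 };

        assert(set_region_and_check(&p, 0x0fUL) == 0); /* partial */
        assert(set_region_and_check(&p, ~0UL) == 1);   /* complete */
        return 0;
    }
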