summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c1
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/Makefile3
-rw-r--r--fs/binfmt_aout.c54
-rw-r--r--fs/binfmt_elf.c153
-rw-r--r--fs/binfmt_elf_fdpic.c8
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/ceph/addr.c21
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/ioctl.c8
-rw-r--r--fs/ceph/mds_client.c3
-rw-r--r--fs/ceph/super.c37
-rw-r--r--fs/cifs/file.c1
-rw-r--r--fs/compat_binfmt_elf.c7
-rw-r--r--fs/coredump.c20
-rw-r--r--fs/coredump.h6
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/exec.c21
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/exofs/sys.c7
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/ext4/ext4.h49
-rw-r--r--fs/ext4/extents.c258
-rw-r--r--fs/ext4/file.c8
-rw-r--r--fs/ext4/fsync.c92
-rw-r--r--fs/ext4/ialloc.c9
-rw-r--r--fs/ext4/indirect.c18
-rw-r--r--fs/ext4/inode.c83
-rw-r--r--fs/ext4/ioctl.c22
-rw-r--r--fs/ext4/mballoc.c129
-rw-r--r--fs/ext4/mballoc.h5
-rw-r--r--fs/ext4/move_extent.c520
-rw-r--r--fs/ext4/namei.c105
-rw-r--r--fs/ext4/page-io.c176
-rw-r--r--fs/ext4/resize.c432
-rw-r--r--fs/ext4/super.c92
-rw-r--r--fs/fat/Makefile2
-rw-r--r--fs/fat/cache.c10
-rw-r--r--fs/fat/dir.c56
-rw-r--r--fs/fat/fat.h97
-rw-r--r--fs/fat/fatent.c13
-rw-r--r--fs/fat/inode.c187
-rw-r--r--fs/fat/namei_msdos.c7
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fat/nfs.c101
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fs-writeback.c8
-rw-r--r--fs/fuse/file.c1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/hpfs/anode.c6
-rw-r--r--fs/hpfs/dnode.c28
-rw-r--r--fs/hugetlbfs/inode.c11
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jbd/commit.c45
-rw-r--r--fs/jbd/transaction.c64
-rw-r--r--fs/jbd2/commit.c40
-rw-r--r--fs/jbd2/journal.c5
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c65
-rw-r--r--fs/jffs2/readinode.c13
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nilfs2/file.c3
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/omfs/file.c5
-rw-r--r--fs/proc/base.c117
-rw-r--r--fs/proc/generic.c15
-rw-r--r--fs/proc/inode.c1
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/ftrace.c96
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c9
-rw-r--r--fs/pstore/ram.c28
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/udf/file.c9
-rw-r--r--fs/udf/inode.c59
-rw-r--r--fs/xfs/xfs_file.c2
86 files changed, 2257 insertions, 1328 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index dd6f7ee1e31..c2483e97bee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = v9fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 02257420274..0efd1524b97 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -164,3 +164,11 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config COREDUMP
+ bool "Enable core dump support" if EXPERT
+ default y
+ help
+ This option enables support for performing core dumps. You almost
+ certainly want to say Y here. Not necessary on systems that never
+ need debugging or only ever run flawless code.
diff --git a/fs/Makefile b/fs/Makefile
index 8938f825032..1d7af79288a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o coredump.o
+ stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
+obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_FHANDLE) += fhandle.o
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index d146e181d10..0e7a6f81ae3 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -32,31 +32,8 @@
static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
static int load_aout_library(struct file*);
-static int aout_core_dump(struct coredump_params *cprm);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
- }
- return 0;
-}
+#ifdef CONFIG_COREDUMP
/*
* Routine writes a core dump image in the current directory.
* Currently only a stub-function.
@@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end)
* field, which also makes sure the core-dumps won't be recursive if the
* dumping of the process results in another error..
*/
-
static int aout_core_dump(struct coredump_params *cprm)
{
struct file *file = cprm->file;
@@ -89,7 +65,7 @@ static int aout_core_dump(struct coredump_params *cprm)
current->flags |= PF_DUMPCORE;
strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->signr;
+ dump.signal = cprm->siginfo->si_signo;
aout_dump_thread(cprm->regs, &dump);
/* If the size of the dump file exceeds the rlimit, then see what would happen
@@ -135,6 +111,32 @@ end_coredump:
set_fs(fs);
return has_dumped;
}
+#else
+#define aout_core_dump NULL
+#endif
+
+static struct linux_binfmt aout_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_aout_binary,
+ .load_shlib = load_aout_library,
+ .core_dump = aout_core_dump,
+ .min_coredump = PAGE_SIZE
+};
+
+#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+ start = PAGE_ALIGN(start);
+ end = PAGE_ALIGN(end);
+ if (end > start) {
+ unsigned long addr;
+ addr = vm_brk(start, end - start);
+ if (BAD_ADDR(addr))
+ return addr;
+ }
+ return 0;
+}
/*
* create_aout_tables() parses the env- and arg-strings in new user
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0225fddf49b..e800dec958c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,6 +27,7 @@
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
@@ -37,6 +38,13 @@
#include <asm/page.h>
#include <asm/exec.h>
+#ifndef user_long_t
+#define user_long_t long
+#endif
+#ifndef user_siginfo_t
+#define user_siginfo_t siginfo_t
+#endif
+
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
static int load_elf_library(struct file *);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
@@ -881,7 +889,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
}
if (elf_interpreter) {
- unsigned long uninitialized_var(interp_map_addr);
+ unsigned long interp_map_addr = 0;
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
@@ -1115,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (always_dump_vma(vma))
goto whole;
- if (vma->vm_flags & VM_NODUMP)
+ if (vma->vm_flags & VM_DONTDUMP)
return 0;
/* Hugetlb memory check */
@@ -1127,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
}
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED))
+ if (vma->vm_flags & VM_IO)
return 0;
/* By default, dump shared memory if mapped from an anonymous file. */
@@ -1372,6 +1380,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
}
+static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
+ siginfo_t *siginfo)
+{
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
+ set_fs(old_fs);
+ fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+}
+
+#define MAX_FILE_NOTE_SIZE (4*1024*1024)
+/*
+ * Format of NT_FILE note:
+ *
+ * long count -- how many files are mapped
+ * long page_size -- units for file_ofs
+ * array of [COUNT] elements of
+ * long start
+ * long end
+ * long file_ofs
+ * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+ */
+static void fill_files_note(struct memelfnote *note)
+{
+ struct vm_area_struct *vma;
+ unsigned count, size, names_ofs, remaining, n;
+ user_long_t *data;
+ user_long_t *start_end_ofs;
+ char *name_base, *name_curpos;
+
+ /* *Estimated* file count and total data size needed */
+ count = current->mm->map_count;
+ size = count * 64;
+
+ names_ofs = (2 + 3 * count) * sizeof(data[0]);
+ alloc:
+ if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+ goto err;
+ size = round_up(size, PAGE_SIZE);
+ data = vmalloc(size);
+ if (!data)
+ goto err;
+
+ start_end_ofs = data + 2;
+ name_base = name_curpos = ((char *)data) + names_ofs;
+ remaining = size - names_ofs;
+ count = 0;
+ for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ struct file *file;
+ const char *filename;
+
+ file = vma->vm_file;
+ if (!file)
+ continue;
+ filename = d_path(&file->f_path, name_curpos, remaining);
+ if (IS_ERR(filename)) {
+ if (PTR_ERR(filename) == -ENAMETOOLONG) {
+ vfree(data);
+ size = size * 5 / 4;
+ goto alloc;
+ }
+ continue;
+ }
+
+ /* d_path() fills at the end, move name down */
+ /* n = strlen(filename) + 1: */
+ n = (name_curpos + remaining) - filename;
+ remaining = filename - name_curpos;
+ memmove(name_curpos, filename, n);
+ name_curpos += n;
+
+ *start_end_ofs++ = vma->vm_start;
+ *start_end_ofs++ = vma->vm_end;
+ *start_end_ofs++ = vma->vm_pgoff;
+ count++;
+ }
+
+ /* Now we know exact count of files, can store it */
+ data[0] = count;
+ data[1] = PAGE_SIZE;
+ /*
+ * Count usually is less than current->mm->map_count,
+ * we need to move filenames down.
+ */
+ n = current->mm->map_count - count;
+ if (n != 0) {
+ unsigned shift_bytes = n * 3 * sizeof(data[0]);
+ memmove(name_base - shift_bytes, name_base,
+ name_curpos - name_base);
+ name_curpos -= shift_bytes;
+ }
+
+ size = name_curpos - (char *)data;
+ fill_note(note, "CORE", NT_FILE, size, data);
+ err: ;
+}
+
#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
@@ -1385,7 +1490,10 @@ struct elf_thread_core_info {
struct elf_note_info {
struct elf_thread_core_info *thread;
struct memelfnote psinfo;
+ struct memelfnote signote;
struct memelfnote auxv;
+ struct memelfnote files;
+ user_siginfo_t csigdata;
size_t size;
int thread_notes;
};
@@ -1480,7 +1588,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1550,7 +1658,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, signr, &info->size))
+ if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
return 0;
/*
@@ -1559,9 +1667,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ info->size += notesize(&info->signote);
+
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
+ fill_files_note(&info->files);
+ info->size += notesize(&info->files);
+
return 1;
}
@@ -1588,8 +1702,12 @@ static int write_note_info(struct elf_note_info *info,
if (first && !writenote(&info->psinfo, file, foffset))
return 0;
+ if (first && !writenote(&info->signote, file, foffset))
+ return 0;
if (first && !writenote(&info->auxv, file, foffset))
return 0;
+ if (first && !writenote(&info->files, file, foffset))
+ return 0;
for (i = 1; i < info->thread_notes; ++i)
if (t->notes[i].data &&
@@ -1616,6 +1734,7 @@ static void free_note_info(struct elf_note_info *info)
kfree(t);
}
kfree(info->psinfo.data);
+ vfree(info->files.data);
}
#else
@@ -1681,6 +1800,7 @@ struct elf_note_info {
#ifdef ELF_CORE_COPY_XFPREGS
elf_fpxregset_t *xfpu;
#endif
+ user_siginfo_t csigdata;
int thread_status_size;
int numnote;
};
@@ -1690,8 +1810,8 @@ static int elf_note_info_init(struct elf_note_info *info)
memset(info, 0, sizeof(*info));
INIT_LIST_HEAD(&info->thread_list);
- /* Allocate space for six ELF notes */
- info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
+ /* Allocate space for ELF notes */
+ info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL);
if (!info->notes)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
@@ -1713,14 +1833,14 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
if (!elf_note_info_init(info))
return 0;
- if (signr) {
+ if (siginfo->si_signo) {
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -1738,13 +1858,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
int sz;
ets = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(signr, ets);
+ sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(info->prstatus, current, signr);
+ fill_prstatus(info->prstatus, current, siginfo->si_signo);
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
@@ -1761,9 +1881,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- info->numnote = 2;
+ fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_auxv_note(info->notes + 3, current->mm);
+ fill_files_note(info->notes + 4);
- fill_auxv_note(&info->notes[info->numnote++], current->mm);
+ info->numnote = 5;
/* Try to dump the FPU. */
info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1825,6 +1947,9 @@ static void free_note_info(struct elf_note_info *info)
kfree(list_entry(tmp, struct elf_thread_status, list));
}
+ /* Free data allocated by fill_files_note(): */
+ vfree(info->notes[4].data);
+
kfree(info->prstatus);
kfree(info->psinfo);
kfree(info->notes);
@@ -1951,7 +2076,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
+ if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
goto cleanup;
has_dumped = 1;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3d77cf81ba3..262db114ff0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
int dump_ok;
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED)) {
+ if (vma->vm_flags & VM_IO) {
kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
return 0;
}
@@ -1642,7 +1642,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto cleanup;
#endif
- if (cprm->signr) {
+ if (cprm->siginfo->si_signo) {
struct core_thread *ct;
struct elf_thread_status *tmp;
@@ -1661,13 +1661,13 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
int sz;
tmp = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(cprm->signr, tmp);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
thread_status_size += sz;
}
}
/* now collect the dump for the current */
- fill_prstatus(prstatus, current, cprm->signr);
+ fill_prstatus(prstatus, current, cprm->siginfo->si_signo);
elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
segs = current->mm->map_count;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 178cb70acc2..e280352b28f 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -107,7 +107,7 @@ static struct linux_binfmt flat_format = {
static int flat_core_dump(struct coredump_params *cprm)
{
printk("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, (int) cprm->signr);
+ current->comm, current->pid, (int) cprm->siginfo->si_signo);
return(1);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5caf285c6e4..f6b40e86121 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1599,6 +1599,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
file_accessed(filp);
vma->vm_ops = &btrfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 58e2e7b7737..b5f044283ed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t size;
int ret;
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
sb_start_pagefault(sb);
+
+ /*
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
+ */
+ file_update_time(vma->vm_file);
+
ret = __block_page_mkwrite(vma, vmf, get_block);
sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 452e71a1b75..6690269f5dd 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -205,7 +205,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- page->index << PAGE_CACHE_SHIFT, &len,
+ (u64) page_offset(page), &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
@@ -286,7 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
int nr_pages = 0;
int ret;
- off = page->index << PAGE_CACHE_SHIFT;
+ off = (u64) page_offset(page);
/* count pages */
next_index = page->index;
@@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
NULL, 0,
ci->i_truncate_seq, ci->i_truncate_size,
NULL, false, 1, 0);
- if (!req)
- return -ENOMEM;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
/* build page vector */
nr_pages = len >> PAGE_CACHE_SHIFT;
@@ -426,7 +426,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
- loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+ loff_t page_off = page_offset(page);
int len = PAGE_CACHE_SIZE;
loff_t i_size;
int err = 0;
@@ -817,8 +817,7 @@ get_more_pages:
/* ok */
if (locked_pages == 0) {
/* prepare async write request */
- offset = (unsigned long long)page->index
- << PAGE_CACHE_SHIFT;
+ offset = (u64) page_offset(page);
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout,
@@ -832,8 +831,8 @@ get_more_pages:
ci->i_truncate_size,
&inode->i_mtime, true, 1, 0);
- if (!req) {
- rc = -ENOMEM;
+ if (IS_ERR(req)) {
+ rc = PTR_ERR(req);
unlock_page(page);
break;
}
@@ -1180,7 +1179,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct page *page = vmf->page;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t off = page->index << PAGE_CACHE_SHIFT;
+ loff_t off = page_offset(page);
loff_t size, len;
int ret;
@@ -1225,6 +1224,7 @@ out:
static struct vm_operations_struct ceph_vmops = {
.fault = filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ceph_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1235,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ceph_vmops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 620daad201d..3251e9cc640 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1005,7 +1005,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
head = msg->front.iov_base;
- head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ le32_add_cpu(&head->num, 1);
item = msg->front.iov_base + msg->front.iov_len;
item->ino = cpu_to_le64(ino);
item->cap_id = cpu_to_le64(cap_id);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ecebbc09bfc..5840d2aaed1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -536,8 +536,8 @@ more:
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
&mtime, false, 2, page_align);
- if (!req)
- return -ENOMEM;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
if (file->f_flags & O_DIRECT) {
pages = ceph_get_direct_page_vector(data, num_pages, false);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 1396ceb4679..36549a46e31 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -187,14 +187,18 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
u64 tmp;
struct ceph_object_layout ol;
struct ceph_pg pgid;
+ int r;
/* copy and validate */
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
down_read(&osdc->map_sem);
- ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
- &dl.object_no, &dl.object_offset, &olen);
+ r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ &dl.object_no, &dl.object_offset,
+ &olen);
+ if (r < 0)
+ return -EIO;
dl.file_offset -= dl.object_offset;
dl.object_size = ceph_file_layout_object_size(ci->i_layout);
dl.block_size = ceph_file_layout_su(ci->i_layout);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a5a735422aa..1bcf712655d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2625,7 +2625,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
session_state_name(s->s_state));
- if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ if (i >= newmap->m_max_mds ||
+ memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3a42d932637..2eb43f21132 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -307,7 +307,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
{
struct ceph_mount_options *fsopt;
const char *dev_name_end;
- int err = -ENOMEM;
+ int err;
+
+ if (!dev_name || !*dev_name)
+ return -EINVAL;
fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
if (!fsopt)
@@ -328,21 +331,33 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ /*
+ * Distinguish the server list from the path in "dev_name".
+ * Internally we do not include the leading '/' in the path.
+ *
+ * "dev_name" will look like:
+ * <server_spec>[,<server_spec>...]:[<path>]
+ * where
+ * <server_spec> is <ip>[:<port>]
+ * <path> is optional, but if present must begin with '/'
+ */
+ dev_name_end = strchr(dev_name, '/');
+ if (dev_name_end) {
+ /* skip over leading '/' for path */
+ *path = dev_name_end + 1;
+ } else {
+ /* path is empty */
+ dev_name_end = dev_name + strlen(dev_name);
+ *path = dev_name_end;
+ }
err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name_end--; /* back up to ':' separator */
+ if (*dev_name_end != ':') {
+ pr_err("device name is missing path (no : separator in %s)\n",
dev_name);
goto out;
}
- dev_name_end = *path;
dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
-
- /* path on server */
- *path += 2;
dout("server path '%s'\n", *path);
*popt = ceph_parse_options(options, dev_name, dev_name_end,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7d7bbdc4c8e..edb25b4bbb9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = cifs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 112e45a17e9..a81147e2e4e 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -38,6 +38,13 @@
#define elf_addr_t Elf32_Addr
/*
+ * Some data types as stored in coredump.
+ */
+#define user_long_t compat_long_t
+#define user_siginfo_t compat_siginfo_t
+#define copy_siginfo_to_user copy_siginfo_to_user32
+
+/*
* The machine-dependent core note format types are defined in elfcore-compat.h,
* which requires asm/elf.h to define compat_elf_gregset_t et al.
*/
diff --git a/fs/coredump.c b/fs/coredump.c
index f045bbad682..fd37facac8d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -14,6 +14,7 @@
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
@@ -39,6 +40,7 @@
#include <trace/events/task.h>
#include "internal.h"
+#include "coredump.h"
#include <trace/events/sched.h>
@@ -147,7 +149,7 @@ put_exe_file:
* name into corename, which must have space for at least
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
*/
-static int format_corename(struct core_name *cn, long signr)
+static int format_corename(struct core_name *cn, struct coredump_params *cprm)
{
const struct cred *cred = current_cred();
const char *pat_ptr = core_pattern;
@@ -192,9 +194,13 @@ static int format_corename(struct core_name *cn, long signr)
case 'g':
err = cn_printf(cn, "%d", cred->gid);
break;
+ case 'd':
+ err = cn_printf(cn, "%d",
+ __get_dumpable(cprm->mm_flags));
+ break;
/* signal that caused the coredump */
case 's':
- err = cn_printf(cn, "%ld", signr);
+ err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
break;
/* UNIX time of coredump */
case 't': {
@@ -451,7 +457,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return 0;
}
-void do_coredump(long signr, int exit_code, struct pt_regs *regs)
+void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
{
struct core_state core_state;
struct core_name cn;
@@ -466,7 +472,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
bool need_nonrelative = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
- .signr = signr,
+ .siginfo = siginfo,
.regs = regs,
.limit = rlimit(RLIMIT_CORE),
/*
@@ -477,7 +483,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
.mm_flags = mm->flags,
};
- audit_core_dumps(signr);
+ audit_core_dumps(siginfo->si_signo);
binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
@@ -501,7 +507,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
need_nonrelative = true;
}
- retval = coredump_wait(exit_code, &core_state);
+ retval = coredump_wait(siginfo->si_signo, &core_state);
if (retval < 0)
goto fail_creds;
@@ -513,7 +519,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
*/
clear_thread_flag(TIF_SIGPENDING);
- ispipe = format_corename(&cn, signr);
+ ispipe = format_corename(&cn, &cprm);
if (ispipe) {
int dump_count;
diff --git a/fs/coredump.h b/fs/coredump.h
new file mode 100644
index 00000000000..e39ff072110
--- /dev/null
+++ b/fs/coredump.h
@@ -0,0 +1,6 @@
+#ifndef _FS_COREDUMP_H
+#define _FS_COREDUMP_H
+
+extern int __get_dumpable(unsigned long mm_flags);
+
+#endif
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649bfe6..da72250ddc1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
- return op != EPOLL_CTL_DEL;
+ return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
}
/* Initialize the poll safe wake up structure */
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
return 0;
}
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+ int result = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->lock, flags);
+ if (epi->event.events & ~EP_PRIVATE_BITS) {
+ if (ep_is_linked(&epi->rdllink))
+ list_del_init(&epi->rdllink);
+ /* Ensure ep_poll_callback will not add epi back onto ready
+ list: */
+ epi->event.events &= EP_PRIVATE_BITS;
+ }
+ else
+ result = -EBUSY;
+ spin_unlock_irqrestore(&ep->lock, flags);
+
+ return result;
+}
+
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
-
-
#define PATH_ARR_SIZE 5
/*
* These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
error = -ENOENT;
break;
+ case EPOLL_CTL_DISABLE:
+ if (epi)
+ error = ep_disable(ep, epi);
+ else
+ error = -ENOENT;
+ break;
}
mutex_unlock(&ep->mtx);
diff --git a/fs/exec.c b/fs/exec.c
index 48fb26ef8a1..4f2bebc276c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -63,6 +63,7 @@
#include <trace/events/task.h>
#include "internal.h"
+#include "coredump.h"
#include <trace/events/sched.h>
@@ -602,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* process cleanup to remove whatever mess we made.
*/
if (length != move_page_tables(vma, old_start,
- vma, new_start, length))
+ vma, new_start, length, false))
return -ENOMEM;
lru_add_drain();
@@ -877,9 +878,11 @@ static int de_thread(struct task_struct *tsk)
sig->notify_count--;
while (sig->notify_count) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
schedule();
+ if (unlikely(__fatal_signal_pending(tsk)))
+ goto killed;
spin_lock_irq(lock);
}
spin_unlock_irq(lock);
@@ -897,9 +900,11 @@ static int de_thread(struct task_struct *tsk)
write_lock_irq(&tasklist_lock);
if (likely(leader->exit_state))
break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(TASK_KILLABLE);
write_unlock_irq(&tasklist_lock);
schedule();
+ if (unlikely(__fatal_signal_pending(tsk)))
+ goto killed;
}
/*
@@ -993,6 +998,14 @@ no_thread_group:
BUG_ON(!thread_group_leader(tsk));
return 0;
+
+killed:
+ /* protects against exit_notify() and __exit_signal() */
+ read_lock(&tasklist_lock);
+ sig->group_exit_task = NULL;
+ sig->notify_count = 0;
+ read_unlock(&tasklist_lock);
+ return -EAGAIN;
}
char *get_task_comm(char *buf, struct task_struct *tsk)
@@ -1096,7 +1109,7 @@ void setup_new_exec(struct linux_binprm * bprm)
current->sas_ss_sp = current->sas_ss_size = 0;
if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
- set_dumpable(current->mm, 1);
+ set_dumpable(current->mm, SUID_DUMPABLE_ENABLED);
else
set_dumpable(current->mm, suid_dumpable);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 5f376d14fdc..b963f38ac29 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
{
- unsigned p;
+ int p;
for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 5a7b691e748..1b4f2f95fc3 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -80,8 +80,13 @@ static ssize_t uri_show(struct exofs_dev *edp, char *buf)
static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
{
+ uint8_t *new_uri;
+
edp->urilen = strlen(buf) + 1;
- edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+ new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+ if (new_uri == NULL)
+ return -ENOMEM;
+ edp->uri = new_uri;
strncpy(edp->uri, buf, edp->urilen);
return edp->urilen;
}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index bd29894c8fb..17ae5c83d23 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -980,7 +980,7 @@ static int parse_options (char *options, struct super_block *sb,
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
@@ -1484,10 +1484,12 @@ static void ext3_orphan_cleanup (struct super_block * sb,
}
if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4ce2d..3ab2539b7b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
- atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
+ unsigned int s_group_info_size;
/* tunables */
unsigned long s_stripe;
@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_writeback_mb_bump;
+ unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
unsigned long s_sectors_written_start;
u64 s_kbytes_written;
+ /* the size of zero-out chunk */
+ unsigned int s_extent_max_zeroout_kb;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+ ext4_group_t s_flex_groups_allocated;
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
@@ -1345,6 +1358,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+ ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+ ext4_group_t ngroup);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f5368..1c94cca35ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+ le16_add_cpu(&neh->eh_depth, 1);
ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
}
/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ size_t s;
+ unsigned max_root = ext4_ext_space_root(inode, 0);
+ ext4_fsblk_t blk;
+
+ if ((path[0].p_depth != 1) ||
+ (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+ (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+ return;
+
+ /*
+ * We need to modify the block allocation bitmap and the block
+ * group descriptor to release the extent tree block. If we
+ * can't get the journal credits, give up.
+ */
+ if (ext4_journal_extend(handle, 2))
+ return;
+
+ /*
+ * Copy the extent data up to the inode
+ */
+ blk = ext4_idx_pblock(path[0].p_idx);
+ s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+ sizeof(struct ext4_extent_idx);
+ s += sizeof(struct ext4_extent_header);
+
+ memcpy(path[0].p_hdr, path[1].p_hdr, s);
+ path[0].p_depth = 0;
+ path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+ (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+ path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+ brelse(path[1].p_bh);
+ ext4_free_blocks(handle, inode, NULL, blk, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
* This function tries to merge the @ex extent to neighbours in the tree.
* return 1 if merge left else 0.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+ struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex) {
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
- int ret = 0;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
if (!merge_done)
- ret = ext4_ext_try_to_merge_right(inode, path, ex);
+ (void) ext4_ext_try_to_merge_right(inode, path, ex);
- return ret;
+ ext4_ext_try_to_merge_up(handle, inode, path);
}
/*
@@ -1893,7 +1937,7 @@ has_space:
merge:
/* try to merge extents */
if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, nearex);
+ ext4_ext_try_to_merge(handle, inode, path, nearex);
/* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
if (err)
goto cleanup;
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
if (npath) {
@@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
}
/*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
* Checks to see if the given block is in the cache.
* If it is, the cached extent is stored in the given
- * cache extent pointer. If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
*
* @inode: The files inode
* @block: The block to look for in the cache
@@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
*
* Return 0 if cache is invalid; 1 if the cache is valid
*/
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_extent *ex)
+{
struct ext4_ext_cache *cex;
struct ext4_sb_info *sbi;
int ret = 0;
@@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
goto errout;
if (in_range(block, cex->ec_block, cex->ec_len)) {
- memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+ ex->ee_block = cpu_to_le32(cex->ec_block);
+ ext4_ext_store_pblock(ex, cex->ec_start);
+ ex->ee_len = cpu_to_le16(cex->ec_len);
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2138,37 +2183,6 @@ errout:
}
/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache cex;
- int ret = 0;
-
- if (ext4_ext_check_cache(inode, block, &cex)) {
- ex->ee_block = cpu_to_le32(cex.ec_block);
- ext4_ext_store_pblock(ex, cex.ec_start);
- ex->ee_len = cpu_to_le16(cex.ec_len);
- ret = 1;
- }
-
- return ret;
-}
-
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
@@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = EXT4_FREE_BLOCKS_FORGET;
+ int flags = 0;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
+
/*
* For bigalloc file systems, we never free a partial cluster
* at the beginning of the extent. Instead, we make a note
@@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i = 0, err = 0;
ext_debug("truncate since %u to %u\n", start, end);
@@ -2604,12 +2621,16 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2662,6 @@ again:
goto out;
}
}
-cont:
-
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
@@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_initialized(ex);
if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle,
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
} else if (err)
goto fix_extent_len;
@@ -3041,7 +3060,6 @@ out:
return err ? err : map->m_len;
}
-#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth;
- int allocated;
+ int allocated, max_zeroout = 0;
int err = 0;
int split_flag = 0;
@@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
"block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len);
+ sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
@@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ max_zeroout = sbi->s_extent_max_zeroout_kb >>
+ inode->i_sb->s_blocksize_bits;
+
+ /* If extent is less than s_max_zeroout_kb, zeroout directly */
+ if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
if (err)
goto out;
@@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (err)
goto out;
ext4_ext_mark_initialized(ex);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (allocated > map->m_len) {
- if (allocated <= EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (max_zeroout && (allocated > map->m_len)) {
+ if (allocated <= max_zeroout) {
/* case 3 */
zero_ex.ee_block =
cpu_to_le32(map->m_lblk);
@@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
split_map.m_lblk = map->m_lblk;
split_map.m_len = allocated;
- } else if ((map->m_lblk - ee_block + map->m_len <
- EXT4_EXT_ZERO_LEN) &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
/* case 2 */
if (map->m_lblk != ee_block) {
zero_ex.ee_block = ex->ee_block;
@@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, 0);
+ &split_map, split_flag, 0);
if (allocated < 0)
err = allocated;
@@ -3256,7 +3276,7 @@ out:
* to an uninitialized extent.
*
* Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
* b> Splits in two extents: Write is happening at either end of the extent
@@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
return err;
@@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
path, flags);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to conversion to written when IO is
@@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4105,8 @@ got_allocated_blocks:
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ set_unwritten = 1;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
}
@@ -4100,6 +4118,15 @@ got_allocated_blocks:
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
* finish any pending end_io work so we won't run the risk of
* converting any truncated blocks to initialized later
*/
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
/*
* probably first extent we're gonna free will be last in block
@@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* It's not possible punch hole on append only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
- ext4_flush_completed_IO(inode);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ err = ext4_flush_unwritten_io(inode);
+ if (err)
+ goto out_dio;
+ inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
- err = ext4_orphan_add(handle, inode);
- if (err)
- goto out;
/*
* Now we need to zero out the non-page-aligned data in the
@@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3b0e3bdaabf..bf3966bccd3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
+ ext4_unwritten_wait(inode);
}
BUG_ON(iocb->ki_pos != pos);
@@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ext4_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2a1dcea4f12..be1d89f385b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
- ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
-
- dump_completed_IO(inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
- list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
-}
-
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
@@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret;
+ int ret, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (inode->i_sb->s_flags & MS_RDONLY)
goto out;
- ret = ext4_flush_completed_IO(inode);
+ ret = ext4_flush_unwritten_io(inode);
if (ret < 0)
goto out;
@@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
- if (needs_barrier)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (needs_barrier) {
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
+ }
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 26154b81b83..fa36372f3fd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -697,6 +697,15 @@ got_group:
if (!gdp)
goto fail;
+ /*
+ * Check free inodes count before loading bitmap.
+ */
+ if (ext4_free_inodes_count(sb, gdp) == 0) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2bf14..792e388e7b4 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+ if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
mutex_lock(&inode->i_mutex);
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
mutex_unlock(&inode->i_mutex);
}
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
+locked:
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c862ee5fe79..b3c243b9afa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
+ /* ensure we send some value back into *errp */
+ *errp = 0;
+
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
- *errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!bh) {
@@ -1954,9 +1956,6 @@ out:
return ret;
}
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
free_blocks = EXT4_C2B(sbi,
percpu_counter_read_positive(&sbi->s_freeclusters_counter));
dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+ /*
+ * Start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+ !writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ }
+
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
@@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
*/
return 1;
}
- /*
- * Even if we don't switch but are nearing capacity,
- * start pushing delalloc when 1/2 of free blocks are dirty.
- */
- if (free_blocks < 2 * dirty_blocks)
- writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
return 0;
}
@@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2910,24 +2909,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
@@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
@@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ ext4_inode_aio_set(inode, io_end);
}
if (overwrite)
@@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
@@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
+ inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+ int need_datasync = 0;
uid_t i_uid;
gid_t i_gid;
@@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
- ext4_isize_set(raw_inode, ei->i_disksize);
+ if (ei->i_disksize != ext4_isize(raw_inode)) {
+ ext4_isize_set(raw_inode, ei->i_disksize);
+ need_datasync = 1;
+ }
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
- ext4_update_inode_fsync_trans(handle, inode, 0);
+ ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
+ if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
+ /* Inode size will be reduced, wait for dio in flight.
+ * Temporarily disable dioread_nolock to prevent
+ * livelock. */
+ if (orphan) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ }
+ }
ext4_truncate(inode);
}
@@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
@@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
int retries = 0;
sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5439d6a56e9..5747f52f7c7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -366,26 +366,11 @@ group_add_out:
return -EOPNOTSUPP;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_META_BG)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not (yet) supported with meta_bg");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
}
- if (n_blocks_count > MAX_32_NUM &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR,
- "File system only supports 32-bit block numbers");
- return -EOPNOTSUPP;
- }
-
err = ext4_resize_begin(sb);
if (err)
return err;
@@ -420,13 +405,6 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
- ext4_msg(sb, KERN_ERR,
- "FITRIM not supported with bigalloc");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 08778f6cdfe..f8b27bf80ac 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -24,6 +24,7 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/debugfs.h>
+#include <linux/log2.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
@@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
+ int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
@@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
@@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
ac->ac_b_ex = ex;
@@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
+ free = grp->bb_free;
+ if (free == 0)
+ return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
- free = grp->bb_free;
fragments = grp->bb_fragments;
- if (free == 0)
- return 0;
if (fragments == 0)
return 0;
@@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
+
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ if (size <= sbi->s_group_info_size)
+ return 0;
+
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) {
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
+}
+
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
@@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
- memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
@@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- int num_meta_group_infos;
- int num_meta_group_infos_max;
- int array_size;
+ int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
- /* This is the number of blocks used by GDT */
- num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
- 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
- /*
- * This is the total number of blocks used by GDT including
- * the number of reserved blocks for GDT.
- * The s_group_info array is allocated with this value
- * to allow a clean online resize without a complex
- * manipulation of pointer.
- * The drawback is the unused memory when no resize
- * occurs but it's very low in terms of pages
- * (see comments below)
- * Need to handle this properly when META_BG resizing is allowed
- */
- num_meta_group_infos_max = num_meta_group_infos +
- le16_to_cpu(es->s_reserved_gdt_blocks);
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /*
- * array_size is the size of s_group_info array. We round it
- * to the next power of two because this approximation is done
- * internally by kmalloc so we can have some more memory
- * for free here (e.g. may be used for META_BG resize).
- */
- array_size = 1;
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2322,7 +2323,7 @@ err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
- i = num_meta_group_infos;
+ i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- memset(ac, 0, sizeof(struct ext4_allocation_context));
ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
@@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
@@ -4657,6 +4657,8 @@ do_more:
* with group lock held. generate_buddy look at
* them with group lock_held
*/
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, block_group, bit, count);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits;
end = start + (range->len >> sb->s_blocksize_bits) - 1;
- minlen = range->minlen >> sb->s_blocksize_bits;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
unlikely(start >= max_blks))
@@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c070618c21c..3ccd889ba95 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
-
-/*
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c623e7..292daeeed45 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_check_null_inode - NULL check for two inodes
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
- const char *function, unsigned int line)
-{
- int ret = 0;
-
- if (inode1 == NULL) {
- __ext4_error(inode2->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 NULL inode2 %lu", inode2->i_ino);
- ret = -EIO;
- } else if (inode2 == NULL) {
- __ext4_error(inode1->i_sb, function, line,
- "Both inodes should not be NULL: "
- "inode1 %lu inode2 NULL", inode1->i_ino);
- ret = -EIO;
- }
- return ret;
-}
-
-/**
* double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes
*/
static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *first, struct inode *second)
{
- struct inode *first = orig_inode, *second = donor_inode;
+ if (first < second) {
+ down_write(&EXT4_I(first)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ } else {
+ down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
}
-
- down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
@@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
- tmp_dext->ee_block =
- cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
- tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+ le32_add_cpu(&tmp_dext->ee_block, diff);
+ le16_add_cpu(&tmp_dext->ee_len, -diff);
if (max_count < ext4_ext_get_actual_len(tmp_dext))
tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
}
/**
+ * mext_check_coverage - Check that all extents in range has the same type
+ *
+ * @inode: inode in question
+ * @from: block offset of inode
+ * @count: block count to be checked
+ * @uninit: extents expected to be uninitialized
+ * @err: pointer to save error value
+ *
+ * Return 1 if all extents in range has expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+ int uninit, int *err)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ext;
+ ext4_lblk_t last = from + count;
+ while (from < last) {
+ *err = get_ext_path(inode, from, &path);
+ if (*err)
+ return 0;
+ ext = path[ext_depth(inode)].p_ext;
+ if (!ext) {
+ ext4_ext_drop_refs(path);
+ return 0;
+ }
+ if (uninit != ext4_ext_is_uninitialized(ext)) {
+ ext4_ext_drop_refs(path);
+ return 0;
+ }
+ from += ext4_ext_get_actual_len(ext);
+ ext4_ext_drop_refs(path);
+ }
+ return 1;
+}
+
+/**
* mext_replace_branches - Replace original extents with new extents
*
* @handle: journal handle
@@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
int replaced_count = 0;
int dext_alen;
- /* Protect extent trees against block allocations via delalloc */
- double_down_write_data_sem(orig_inode, donor_inode);
-
/* Get the original extent for the block "orig_off" */
*err = get_ext_path(orig_inode, orig_off, &orig_path);
if (*err)
@@ -764,12 +763,122 @@ out:
ext4_ext_invalidate_cache(orig_inode);
ext4_ext_invalidate_cache(donor_inode);
- double_up_write_data_sem(orig_inode, donor_inode);
-
return replaced_count;
}
/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ * @index: page index
+ * @page: result page vector
+ *
+ * Grab two locked pages for inode's by inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+ pgoff_t index, struct page *page[2])
+{
+ struct address_space *mapping[2];
+ unsigned fl = AOP_FLAG_NOFS;
+
+ BUG_ON(!inode1 || !inode2);
+ if (inode1 < inode2) {
+ mapping[0] = inode1->i_mapping;
+ mapping[1] = inode2->i_mapping;
+ } else {
+ mapping[0] = inode2->i_mapping;
+ mapping[1] = inode1->i_mapping;
+ }
+
+ page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+ if (!page[0])
+ return -ENOMEM;
+
+ page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+ if (!page[1]) {
+ unlock_page(page[0]);
+ page_cache_release(page[0]);
+ return -ENOMEM;
+ }
+
+ if (inode1 > inode2) {
+ struct page *tmp;
+ tmp = page[0];
+ page[0] = page[1];
+ page[1] = tmp;
+ }
+ return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ sector_t block;
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ unsigned int blocksize, block_start, block_end;
+ int i, err, nr = 0, partial = 0;
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (PageUptodate(page))
+ return 0;
+
+ blocksize = 1 << inode->i_blkbits;
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ head = page_buffers(page);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ continue;
+ }
+ if (buffer_uptodate(bh))
+ continue;
+ if (!buffer_mapped(bh)) {
+ int err = 0;
+ err = ext4_get_block(inode, block, bh, 0);
+ if (err) {
+ SetPageError(page);
+ return err;
+ }
+ if (!buffer_mapped(bh)) {
+ zero_user(page, block_start, blocksize);
+ if (!err)
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+ BUG_ON(nr >= MAX_BUF_PER_PAGE);
+ arr[nr++] = bh;
+ }
+ /* No io required */
+ if (!nr)
+ goto out;
+
+ for (i = 0; i < nr; i++) {
+ bh = arr[i];
+ if (!bh_uptodate_or_lock(bh)) {
+ err = bh_submit_read(bh);
+ if (err)
+ return err;
+ }
+ }
+out:
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+/**
* move_extent_per_page - Move extent data per page
*
* @o_filp: file structure of original file
@@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
- struct address_space *mapping = orig_inode->i_mapping;
- struct buffer_head *bh;
- struct page *page = NULL;
- const struct address_space_operations *a_ops = mapping->a_ops;
+ struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
- void *fsdata;
- int i, jblocks;
- int err2 = 0;
+ int err2, jblocks, retries = 0;
int replaced_count = 0;
+ int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
* It needs twice the amount of ordinary journal buffers because
* inode and donor_inode may change each different metadata blocks.
*/
+again:
+ *err = 0;
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
@@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- /*
- * If orig extent is uninitialized one,
- * it's not necessary force the page into memory
- * and then force it to be written out again.
- * Just swap data blocks between orig and donor.
- */
- if (uninit) {
- replaced_count = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page, err);
- goto out2;
- }
-
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
/* Calculate data_size */
@@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
replaced_size = data_size;
- *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
- &page, &fsdata);
+ *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+ pagep);
if (unlikely(*err < 0))
- goto out;
-
- if (!PageUptodate(page)) {
- mapping->a_ops->readpage(o_filp, page);
- lock_page(page);
- }
-
+ goto stop_journal;
/*
- * try_to_release_page() doesn't call releasepage in writeback mode.
- * We should care about the order of writing to the same file
- * by multiple move extent processes.
- * It needs to call wait_on_page_writeback() to wait for the
- * writeback of the page.
+ * If orig extent was uninitialized it can become initialized
+ * at any time after i_data_sem was dropped, in order to
+ * serialize with delalloc we have recheck extent while we
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
*/
- wait_on_page_writeback(page);
+ if (uninit) {
+ double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+ * fallback to data copying */
+ uninit = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
- /* Release old bh and drop refs */
- try_to_release_page(page, 0);
+ uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
+ if (*err)
+ goto drop_data_sem;
+
+ if (!uninit) {
+ double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+ if ((page_has_private(pagep[0]) &&
+ !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) &&
+ !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+ drop_data_sem:
+ double_up_write_data_sem(orig_inode, donor_inode);
+ goto unlock_pages;
+ }
+data_copy:
+ *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ if (*err)
+ goto unlock_pages;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+ if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+ (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ *err = -EBUSY;
+ goto unlock_pages;
+ }
replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page,
- &err2);
- if (err2) {
+ orig_blk_offset,
+ block_len_in_page, err);
+ if (*err) {
if (replaced_count) {
block_len_in_page = replaced_count;
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
- goto out;
+ goto unlock_pages;
}
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+ *err = __block_write_begin(pagep[0], from, from + replaced_size,
+ ext4_get_block);
+ if (!*err)
+ *err = block_commit_write(pagep[0], from, from + replaced_size);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
- bh = page_buffers(page);
- for (i = 0; i < data_offset_in_page; i++)
- bh = bh->b_this_page;
-
- for (i = 0; i < block_len_in_page; i++) {
- *err = ext4_get_block(orig_inode,
- (sector_t)(orig_blk_offset + i), bh, 0);
- if (*err < 0)
- goto out;
-
- if (bh->b_this_page != NULL)
- bh = bh->b_this_page;
- }
-
- *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
- page, fsdata);
- page = NULL;
-
-out:
- if (unlikely(page)) {
- if (PageLocked(page))
- unlock_page(page);
- page_cache_release(page);
- ext4_journal_stop(handle);
- }
-out2:
+ if (unlikely(*err < 0))
+ goto repair_branches;
+
+ /* Even in case of data=writeback it is reasonable to pin
+ * inode to transaction, to prevent unexpected data loss */
+ *err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+ unlock_page(pagep[0]);
+ page_cache_release(pagep[0]);
+ unlock_page(pagep[1]);
+ page_cache_release(pagep[1]);
+stop_journal:
ext4_journal_stop(handle);
-
- if (err2)
- *err = err2;
-
+ /* Buffer was busy because probably is pinned to journal transaction,
+ * force transaction commit may help to free it. */
+ if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+ &retries))
+ goto again;
return replaced_count;
+
+repair_branches:
+ /*
+ * This should never ever happen!
+ * Extents are swapped already, but we are not able to copy data.
+ * Try to swap extents to it's original places
+ */
+ double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
+ orig_blk_offset,
+ block_len_in_page, &err2);
+ double_up_write_data_sem(orig_inode, donor_inode);
+ if (replaced_count != block_len_in_page) {
+ EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+ "Unable to copy data block,"
+ " data will be lost.");
+ *err = -EIO;
+ }
+ replaced_count = 0;
+ goto unlock_pages;
}
/**
@@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* Files should be in the same ext4 FS */
- if (orig_inode->i_sb != donor_inode->i_sb) {
- ext4_debug("ext4 move extent: The argument files "
- "should be in same FS [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode,
}
if ((orig_start >= EXT_MAX_BLOCKS) ||
- (donor_start >= EXT_MAX_BLOCKS) ||
(*len > EXT_MAX_BLOCKS) ||
(orig_start + *len >= EXT_MAX_BLOCKS)) {
ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
@@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ * Lock two inodes' i_mutex
*/
-static int
+static void
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1 == inode2) {
- mutex_lock(&inode1->i_mutex);
- goto out;
- }
-
- if (inode1->i_ino < inode2->i_ino) {
+ BUG_ON(inode1 == inode2);
+ if (inode1 < inode2) {
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
} else {
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
-
-out:
- return ret;
}
/**
@@ -1109,28 +1223,13 @@ out:
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static int
+static void
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
- int ret = 0;
-
- BUG_ON(inode1 == NULL && inode2 == NULL);
-
- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
- if (ret < 0)
- goto out;
-
- if (inode1)
- mutex_unlock(&inode1->i_mutex);
-
- if (inode2 && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
-
-out:
- return ret;
+ mutex_unlock(&inode1->i_mutex);
+ mutex_unlock(&inode2->i_mutex);
}
/**
@@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret1, ret2, depth, last_extent = 0;
+ int ret, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different inodes */
+ if (orig_inode == donor_inode) {
ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
+ "be same inode [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
@@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
-
+ /* TODO: This is non obvious task to swap blocks for inodes with full
+ jornaling enabled */
+ if (ext4_should_journal_data(orig_inode) ||
+ ext4_should_journal_data(donor_inode)) {
+ return -EINVAL;
+ }
/* Protect orig and donor inodes against a truncate */
- ret1 = mext_inode_double_lock(orig_inode, donor_inode);
- if (ret1 < 0)
- return ret1;
+ mext_inode_double_lock(orig_inode, donor_inode);
+
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
- if (ret1)
+ if (ret)
goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (file_end < block_end)
len -= block_end - file_end;
- ret1 = get_ext_path(orig_inode, block_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret)
goto out;
/* Get path structure to check the hole */
- ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret)
goto out;
depth = ext_depth(orig_inode);
@@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
goto out;
}
seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret1 = -EINVAL;
+ ret = -EINVAL;
goto out;
}
@@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret1 = last_extent;
+ ret = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit,
- &ret1);
+ &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- if (ret1 < 0)
+ if (ret < 0)
break;
if (*moved_len > len) {
EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
- ret1 = -EIO;
+ ret = -EIO;
break;
}
@@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
double_down_write_data_sem(orig_inode, donor_inode);
- if (ret1 < 0)
+ if (ret < 0)
break;
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
- if (ret1)
+ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1527,9 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
- ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
-
- if (ret1)
- return ret1;
- else if (ret2)
- return ret2;
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
+ mext_inode_double_unlock(orig_inode, donor_inode);
- return 0;
+ return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a42cc04466..6d600a69fc9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
{
struct buffer_head *bh;
+ if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+ ((inode->i_size >> 10) >=
+ EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
+ *err = -ENOSPC;
+ return NULL;
+ }
+
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
bh = ext4_bread(handle, inode, *block, 1, err);
@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
bh = NULL;
}
}
+ if (!bh && !(*err)) {
+ *err = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
return bh;
}
@@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
+ if (*err == 0)
+ *err = ERR_BAD_DX_DIR;
goto fail;
+ }
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
- if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
+ if (!(*err))
+ *err = ERR_BAD_DX_DIR;
goto fail2;
+ }
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (!buffer_verified(bh) &&
@@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
*/
while (num_frames--) {
if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
+ 0, &err))) {
+ if (!err) {
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ return -EIO;
+ }
return err; /* Failure */
+ }
if (!buffer_verified(bh) &&
!ext4_dx_csum_verify(dir,
@@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
- int err, count = 0;
+ int err = 0, count = 0;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+ if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
return err;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
- if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
+ if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
+ if (!(*err)) {
+ *err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
goto errout;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
@@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
- bh = ext4_bread(handle, dir, block, 0, &retval);
- if(!bh)
+ if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
+ if (!retval) {
+ retval = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
return retval;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data))
@@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
entries = frame->entries;
at = frame->at;
- if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(dir->i_sb,
+ "Directory hole detected on inode %lu\n",
+ dir->i_ino);
+ }
goto cleanup;
+ }
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -2149,9 +2202,7 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
-#endif
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
@@ -2199,9 +2250,15 @@ retry:
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread(handle, inode, 0, 1, &err);
- if (!dir_block)
+ if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
goto out_clear_inode;
+ }
BUFFER_TRACE(dir_block, "get_write_access");
err = ext4_journal_get_write_access(handle, dir_block);
if (err)
@@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode,
"error %d reading directory "
"lblock %u", err, lblock);
+ else
+ ext4_warning(inode->i_sb,
+ "bad directory (dir #%lu) - no data block",
+ inode->i_ino);
+
offset += sb->s_blocksize;
continue;
}
@@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0, rc;
- if (!ext4_handle_valid(handle))
+ if (!EXT4_SB(sb)->s_journal)
return 0;
mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
@@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- /* ext4_handle_valid() assumes a valid handle_t pointer */
- if (handle && !ext4_handle_valid(handle))
+ if (!EXT4_SB(inode->i_sb)->s_journal)
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (sbi->s_journal && !handle)
+ if (!handle)
goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
- dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
- if (!dir_bh)
+ if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
+ if (!retval) {
+ retval = -EIO;
+ ext4_error(old_inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ old_inode->i_ino);
+ }
goto end_rename;
+ }
if (!buffer_verified(dir_bh) &&
!ext4_dirent_csum_verify(old_inode,
(struct ext4_dir_entry *)dir_bh->b_data))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef169a6..68e896e12a6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
+ BUG_ON(!list_empty(&io->list));
+ BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+
if (io->page)
put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
-
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(io->inode));
return ret;
}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+ unsigned long flags;
+
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+ ext4_debug("inode %lu completed_io list is empty\n",
+ inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
+ if (list_empty(&ei->i_completed_io_list)) {
+ io_end->flag |= EXT4_IO_END_QUEUED;
+ queue_work(wq, &io_end->work);
}
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ ext4_io_end_t *work_io)
+{
+ ext4_io_end_t *io;
+ struct list_head unwritten, complete, to_free;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ INIT_LIST_HEAD(&complete);
+ INIT_LIST_HEAD(&to_free);
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode);
+ list_replace_init(&ei->i_completed_io_list, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+
+ list_add_tail(&io->list, &complete);
+ }
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&complete)) {
+ io = list_entry(complete.next, ext4_io_end_t, list);
+ io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* end_io context can not be destroyed now because it still
+ * used by queued worker. Worker thread will destroy it later */
+ if (io->flag & EXT4_IO_END_QUEUED)
+ list_del_init(&io->list);
+ else
+ list_move(&io->list, &to_free);
+ }
+ /* If we are called from worker context, it is time to clear queued
+ * flag, and destroy it's end_io if it was converted already */
+ if (work_io) {
+ work_io->flag &= ~EXT4_IO_END_QUEUED;
+ if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+ list_add_tail(&work_io->list, &to_free);
}
- list_del_init(&io->list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+
+ while (!list_empty(&to_free)) {
+ io = list_entry(to_free.next, ext4_io_end_t, list);
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ ext4_do_flush_completed_IO(io->inode, io);
+}
+
+int ext4_flush_unwritten_io(struct inode *inode)
+{
+ int ret;
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
+ !(inode->i_state & I_FREEING));
+ ret = ext4_do_flush_completed_IO(inode, NULL);
+ ext4_unwritten_wait(inode);
+ return ret;
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 41f6ef68e2e..7a75e108696 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb)
smp_mb__after_clear_bit();
}
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+ ext4_group_t group) {
+ return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+}
+
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+ ext4_group_t group) {
+ group = ext4_meta_bg_first_group(sb, group);
+ return ext4_group_first_block_no(sb, group);
+}
+
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+ ext4_group_t group) {
+ ext4_grpblk_t overhead;
+ overhead = ext4_bg_num_gdb(sb, group);
+ if (ext4_bg_has_super(sb, group))
+ overhead += 1 +
+ le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ return overhead;
+}
+
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_bg_has_super(sb, group) ?
- (1 + ext4_bg_num_gdb(sb, group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ unsigned overhead = ext4_group_overhead_blocks(sb, group);
ext4_fsblk_t metaend = start + overhead;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
@@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
* be a partial of a flex group.
*
* @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
*/
-static void ext4_alloc_group_tables(struct super_block *sb,
+static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
ext4_fsblk_t start_blk;
ext4_fsblk_t last_blk;
ext4_group_t src_group;
@@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb,
(last_group & ~(flexbg_size - 1))));
next_group:
group = group_data[0].group;
+ if (src_group >= group_data[0].group + flex_gd->count)
+ return -ENOSPC;
start_blk = ext4_group_first_block_no(sb, src_group);
last_blk = start_blk + group_data[src_group - group].blocks_count;
- overhead = ext4_bg_has_super(sb, src_group) ?
- (1 + ext4_bg_num_gdb(sb, src_group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ overhead = ext4_group_overhead_blocks(sb, src_group);
start_blk += overhead;
- BUG_ON(src_group >= group_data[0].group + flex_gd->count);
/* We collect contiguous blocks as much as possible. */
src_group++;
- for (; src_group <= last_group; src_group++)
- if (!ext4_bg_has_super(sb, src_group))
+ for (; src_group <= last_group; src_group++) {
+ overhead = ext4_group_overhead_blocks(sb, src_group);
+ if (overhead != 0)
last_blk += group_data[src_group - group].blocks_count;
else
break;
+ }
/* Allocate block bitmaps */
for (; bb_index < flex_gd->count; bb_index++) {
@@ -300,6 +323,7 @@ next_group:
group_data[i].free_blocks_count);
}
}
+ return 0;
}
static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
ext4_group_t group, count;
struct buffer_head *bh = NULL;
int reserved_gdb, i, j, err = 0, err2;
+ int meta_bg;
BUG_ON(!flex_gd->count || !group_data ||
group_data[0].group != sbi->s_groups_count);
reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
@@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
group = group_data[0].group;
for (i = 0; i < flex_gd->count; i++, group++) {
unsigned long gdblocks;
+ ext4_grpblk_t overhead;
gdblocks = ext4_bg_num_gdb(sb, group);
start = ext4_group_first_block_no(sb, group);
+ if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+ goto handle_itb;
+
+ if (meta_bg == 1) {
+ ext4_group_t first_group;
+ first_group = ext4_meta_bg_first_group(sb, group);
+ if (first_group != group + 1 &&
+ first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+ goto handle_itb;
+ }
+
+ block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
- for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+ for (j = 0; j < gdblocks; j++, block++) {
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
@@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
}
+handle_itb:
/* Initialize group tables of the grop @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -521,11 +561,11 @@ handle_bb:
err = PTR_ERR(bh);
goto out;
}
- if (ext4_bg_has_super(sb, group)) {
+ overhead = ext4_group_overhead_blocks(sb, group);
+ if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
- 1);
+ ext4_set_bits(bh->b_data, 0, overhead);
}
ext4_mark_bitmap_end(group_data[i].blocks_count,
sb->s_blocksize * 8, bh->b_data);
@@ -822,6 +862,45 @@ exit_bh:
}
/*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+ handle_t *handle, ext4_group_t group) {
+ ext4_fsblk_t gdblock;
+ struct buffer_head *gdb_bh;
+ struct buffer_head **o_group_desc, **n_group_desc;
+ unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int err;
+
+ gdblock = ext4_meta_bg_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
+ gdb_bh = sb_bread(sb, gdblock);
+ if (!gdb_bh)
+ return -EIO;
+ n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+ sizeof(struct buffer_head *),
+ GFP_NOFS);
+ if (!n_group_desc) {
+ err = -ENOMEM;
+ ext4_warning(sb, "not enough memory for %lu groups",
+ gdb_num + 1);
+ return err;
+ }
+
+ o_group_desc = EXT4_SB(sb)->s_group_desc;
+ memcpy(n_group_desc, o_group_desc,
+ EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ n_group_desc[gdb_num] = gdb_bh;
+ EXT4_SB(sb)->s_group_desc = n_group_desc;
+ EXT4_SB(sb)->s_gdb_count++;
+ ext4_kvfree(o_group_desc);
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (unlikely(err))
+ brelse(gdb_bh);
+ return err;
+}
+
+/*
* Called when we are adding a new group which has a backup copy of each of
* the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
* We need to add these reserved backup GDT blocks to the resize inode, so
@@ -949,16 +1028,16 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb,
- int blk_off, char *data, int size)
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+ int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- const ext4_group_t last = sbi->s_groups_count;
+ ext4_group_t last;
const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
unsigned three = 1;
unsigned five = 5;
unsigned seven = 7;
- ext4_group_t group;
+ ext4_group_t group = 0;
int rest = sb->s_blocksize - size;
handle_t *handle;
int err = 0, err2;
@@ -970,10 +1049,17 @@ static void update_backups(struct super_block *sb,
goto exit_err;
}
- ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+ if (meta_bg == 0) {
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ last = sbi->s_groups_count;
+ } else {
+ group = ext4_meta_bg_first_group(sb, group) + 1;
+ last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+ }
- while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+ while (group < sbi->s_groups_count) {
struct buffer_head *bh;
+ ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
if (ext4_handle_valid(handle) &&
@@ -982,13 +1068,20 @@ static void update_backups(struct super_block *sb,
(err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
break;
- bh = sb_getblk(sb, group * bpg + blk_off);
+ if (meta_bg == 0)
+ backup_block = group * bpg + blk_off;
+ else
+ backup_block = (ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group));
+
+ bh = sb_getblk(sb, backup_block);
if (!bh) {
err = -EIO;
break;
}
- ext4_debug("update metadata backup %#04lx\n",
- (unsigned long)bh->b_blocknr);
+ ext4_debug("update metadata backup %llu(+%llu)\n",
+ backup_block, backup_block -
+ ext4_group_first_block_no(sb, group));
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -1001,6 +1094,13 @@ static void update_backups(struct super_block *sb,
if (unlikely(err))
ext4_std_error(sb, err);
brelse(bh);
+
+ if (meta_bg == 0)
+ group = ext4_list_backups(sb, &three, &five, &seven);
+ else if (group == last)
+ break;
+ else
+ group = last;
}
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
@@ -1043,7 +1143,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
struct ext4_super_block *es = sbi->s_es;
struct buffer_head *gdb_bh;
int i, gdb_off, gdb_num, err = 0;
+ int meta_bg;
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
for (i = 0; i < count; i++, group++) {
int reserved_gdb = ext4_bg_has_super(sb, group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1063,8 +1165,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
err = reserve_backup_gdb(handle, resize_inode, group);
- } else
+ } else if (meta_bg != 0) {
+ err = add_new_gdb_meta_bg(sb, handle, group);
+ } else {
err = add_new_gdb(handle, resize_inode, group);
+ }
if (err)
break;
}
@@ -1076,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
struct buffer_head *bh = sb_getblk(sb, block);
if (!bh)
return NULL;
-
- if (bitmap_uptodate(bh))
- return bh;
-
- lock_buffer(bh);
- if (bh_submit_read(bh) < 0) {
- unlock_buffer(bh);
- brelse(bh);
- return NULL;
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ brelse(bh);
+ return NULL;
+ }
}
- unlock_buffer(bh);
return bh;
}
@@ -1161,6 +1261,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_free_group_clusters_set(sb, gdp,
EXT4_B2C(sbi, group_data->free_blocks_count));
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ if (ext4_has_group_desc_csum(sb))
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb));
gdp->bg_flags = cpu_to_le16(*bg_flags);
ext4_group_desc_csum_set(sb, group, gdp);
@@ -1216,7 +1319,7 @@ static void ext4_update_super(struct super_block *sb,
}
reserved_blocks = ext4_r_blocks_count(es) * 100;
- do_div(reserved_blocks, ext4_blocks_count(es));
+ reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
reserved_blocks *= blocks_count;
do_div(reserved_blocks, 100);
@@ -1227,6 +1330,7 @@ static void ext4_update_super(struct super_block *sb,
le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
flex_gd->count);
+ ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
/*
* We need to protect s_groups_count against other CPUs seeing
* inconsistent state in the superblock.
@@ -1261,6 +1365,8 @@ static void ext4_update_super(struct super_block *sb,
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+ ext4_debug("free blocks count %llu",
+ percpu_counter_read(&sbi->s_freeclusters_counter));
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
sbi->s_log_groups_per_flex) {
@@ -1349,16 +1455,24 @@ exit_journal:
err = err2;
if (!err) {
- int i;
+ int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+ int gdb_num_end = ((group + flex_gd->count - 1) /
+ EXT4_DESC_PER_BLOCK(sb));
+ int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_META_BG);
+ sector_t old_gdb = 0;
+
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
- for (i = 0; i < flex_gd->count; i++, group++) {
+ sizeof(struct ext4_super_block), 0);
+ for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- int gdb_num;
- gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+
gdb_bh = sbi->s_group_desc[gdb_num];
+ if (old_gdb == gdb_bh->b_blocknr)
+ continue;
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size);
+ gdb_bh->b_size, meta_bg);
+ old_gdb = gdb_bh->b_blocknr;
}
}
exit:
@@ -1402,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
group_data[i].group = group + i;
group_data[i].blocks_count = blocks_per_group;
- overhead = ext4_bg_has_super(sb, group + i) ?
- (1 + ext4_bg_num_gdb(sb, group + i) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ overhead = ext4_group_overhead_blocks(sb, group + i);
group_data[i].free_blocks_count = blocks_per_group - overhead;
if (ext4_has_group_desc_csum(sb))
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
@@ -1492,6 +1604,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (err)
goto out;
+ err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+ if (err)
+ goto out;
+
flex_gd.count = 1;
flex_gd.groups = input;
flex_gd.bg_flags = &bg_flags;
@@ -1544,11 +1664,13 @@ errout:
err = err2;
if (!err) {
+ ext4_fsblk_t first_block;
+ first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ (char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
}
@@ -1631,6 +1753,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
return err;
} /* ext4_group_extend */
+
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+ return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
+}
+
+/*
+ * Release the resize inode and drop the resize_inode feature if there
+ * are no more reserved gdt blocks, and then convert the file system
+ * to enable meta_bg
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+ handle_t *handle;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t nr;
+ int i, ret, err = 0;
+ int credits = 1;
+
+ ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+ if (inode) {
+ if (es->s_reserved_gdt_blocks) {
+ ext4_error(sb, "Unexpected non-zero "
+ "s_reserved_gdt_blocks");
+ return -EPERM;
+ }
+
+ /* Do a quick sanity check of the resize inode */
+ if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+ goto invalid_resize_inode;
+ for (i = 0; i < EXT4_N_BLOCKS; i++) {
+ if (i == EXT4_DIND_BLOCK) {
+ if (ei->i_data[i])
+ continue;
+ else
+ goto invalid_resize_inode;
+ }
+ if (ei->i_data[i])
+ goto invalid_resize_inode;
+ }
+ credits += 3; /* block bitmap, bg descriptor, resize inode */
+ }
+
+ handle = ext4_journal_start_sb(sb, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto errout;
+
+ EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ sbi->s_es->s_first_meta_bg =
+ cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+ err = ext4_handle_dirty_super(handle, sb);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto errout;
+ }
+
+ if (inode) {
+ nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ ei->i_data[EXT4_DIND_BLOCK] = 0;
+ inode->i_blocks = 0;
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ ext4_std_error(sb, err);
+ }
+
+errout:
+ ret = ext4_journal_stop(handle);
+ if (!err)
+ err = ret;
+ return ret;
+
+invalid_resize_inode:
+ ext4_error(sb, "corrupted/inconsistent resize inode");
+ return -EINVAL;
+}
+
/*
* ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
*
@@ -1643,21 +1853,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
struct buffer_head *bh;
- struct inode *resize_inode;
- ext4_fsblk_t o_blocks_count;
- ext4_group_t o_group;
- ext4_group_t n_group;
- ext4_grpblk_t offset, add;
+ struct inode *resize_inode = NULL;
+ ext4_grpblk_t add, offset;
unsigned long n_desc_blocks;
unsigned long o_desc_blocks;
- unsigned long desc_blocks;
- int err = 0, flexbg_size = 1;
+ ext4_group_t o_group;
+ ext4_group_t n_group;
+ ext4_fsblk_t o_blocks_count;
+ ext4_fsblk_t n_blocks_count_retry = 0;
+ unsigned long last_update_time = 0;
+ int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int meta_bg;
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, n_blocks_count - 1);
+ if (!bh) {
+ ext4_warning(sb, "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+retry:
o_blocks_count = ext4_blocks_count(es);
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
- "to %llu blocks", o_blocks_count, n_blocks_count);
+ ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+ "to %llu blocks", o_blocks_count, n_blocks_count);
if (n_blocks_count < o_blocks_count) {
/* On-line shrinking not supported */
@@ -1672,32 +1892,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
- n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
- EXT4_DESC_PER_BLOCK(sb);
- o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- desc_blocks = n_desc_blocks - o_desc_blocks;
+ n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+ o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
- if (desc_blocks &&
- (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
- le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
- ext4_warning(sb, "No reserved GDT blocks, can't resize");
- return -EPERM;
- }
+ meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
- resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
- if (IS_ERR(resize_inode)) {
- ext4_warning(sb, "Error opening resize inode");
- return PTR_ERR(resize_inode);
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (meta_bg) {
+ ext4_error(sb, "resize_inode and meta_bg enabled "
+ "simultaneously");
+ return -EINVAL;
+ }
+ if (n_desc_blocks > o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ n_blocks_count_retry = n_blocks_count;
+ n_desc_blocks = o_desc_blocks +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+ n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+ n_group--; /* set to last group number */
+ }
+
+ if (!resize_inode)
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ if (IS_ERR(resize_inode)) {
+ ext4_warning(sb, "Error opening resize inode");
+ return PTR_ERR(resize_inode);
+ }
}
- /* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
- ext4_warning(sb, "can't read last block, resize aborted");
- return -ENOSPC;
+ if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+ err = ext4_convert_meta_bg(sb, resize_inode);
+ if (err)
+ goto out;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
+ if (n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ goto retry;
+ }
}
- brelse(bh);
/* extend the last group */
if (n_group == o_group)
@@ -1710,12 +1947,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
goto out;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- es->s_log_groups_per_flex)
- flexbg_size = 1 << es->s_log_groups_per_flex;
+ if (ext4_blocks_count(es) == n_blocks_count)
+ goto out;
- o_blocks_count = ext4_blocks_count(es);
- if (o_blocks_count == n_blocks_count)
+ err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+ if (err)
+ return err;
+
+ err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+ if (err)
goto out;
flex_gd = alloc_flex_gd(flexbg_size);
@@ -1729,19 +1969,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+ if (jiffies - last_update_time > HZ * 10) {
+ if (last_update_time)
+ ext4_msg(sb, KERN_INFO,
+ "resized to %llu blocks",
+ ext4_blocks_count(es));
+ last_update_time = jiffies;
+ }
+ if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+ break;
err = ext4_flex_group_add(sb, resize_inode, flex_gd);
if (unlikely(err))
break;
}
+ if (!err && n_blocks_count_retry) {
+ n_blocks_count = n_blocks_count_retry;
+ n_blocks_count_retry = 0;
+ free_flex_gd(flex_gd);
+ flex_gd = NULL;
+ goto retry;
+ }
+
out:
if (flex_gd)
free_flex_gd(flex_gd);
-
- iput(resize_inode);
- if (test_opt(sb, DEBUG))
- ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
- "upto %llu blocks", o_blocks_count, n_blocks_count);
+ if (resize_inode != NULL)
+ iput(resize_inode);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 69c55d4e462..7265a036747 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
*/
if (!es->s_error_count)
mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
- es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+ le32_add_cpu(&es->s_error_count, 1);
}
static void save_error_info(struct super_block *sb, const char *func,
@@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb)
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
- lock_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb)
* Now that we are completely done shutting down the
* superblock, we need to actually destroy the kobject.
*/
- unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
if (sbi->s_chksum_driver)
@@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
- atomic_set(&ei->i_aiodio_unwritten, 0);
+ atomic_set(&ei->i_unwritten, 0);
return &ei->vfs_inode;
}
@@ -1224,6 +1221,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+ Opt_max_dir_size_kb,
};
static const match_table_t tokens = {
@@ -1297,6 +1295,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
+ {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1477,6 +1476,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_err, 0, 0}
};
@@ -1592,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
if (!args->from)
arg = EXT4_DEF_LI_WAIT_MULT;
sbi->s_li_wait_mult = arg;
+ } else if (token == Opt_max_dir_size_kb) {
+ sbi->s_max_dir_size_kb = arg;
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (m->flags & MOPT_DATAJ) {
@@ -1664,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb,
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
if (handle_mount_opt(sb, p, token, args, journal_devnum,
journal_ioprio, is_remount) < 0)
@@ -1740,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
- static const struct match_token *t;
+ const struct match_token *t;
for (t = tokens; t->token != Opt_err; t++)
if (t->token == token && !strchr(t->pattern, '='))
@@ -1823,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+ if (nodefs || sbi->s_max_dir_size_kb)
+ SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
ext4_show_quota_options(seq, sb);
return 0;
@@ -1914,15 +1918,45 @@ done:
return res;
}
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups *new_groups;
+ int size;
+
+ if (!sbi->s_log_groups_per_flex)
+ return 0;
+
+ size = ext4_flex_group(sbi, ngroup - 1) + 1;
+ if (size <= sbi->s_flex_groups_allocated)
+ return 0;
+
+ size = roundup_pow_of_two(size * sizeof(struct flex_groups));
+ new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groups) {
+ ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+ size / (int) sizeof(struct flex_groups));
+ return -ENOMEM;
+ }
+
+ if (sbi->s_flex_groups) {
+ memcpy(new_groups, sbi->s_flex_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups)));
+ ext4_kvfree(sbi->s_flex_groups);
+ }
+ sbi->s_flex_groups = new_groups;
+ sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ return 0;
+}
+
static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
- ext4_group_t flex_group_count;
ext4_group_t flex_group;
unsigned int groups_per_flex = 0;
- size_t size;
- int i;
+ int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
@@ -1931,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
}
groups_per_flex = 1 << sbi->s_log_groups_per_flex;
- /* We allocate both existing and potentially added groups */
- flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
- ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
- EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
- size = flex_group_count * sizeof(struct flex_groups);
- sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
- if (sbi->s_flex_groups == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
- flex_group_count);
+ err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+ if (err)
goto failed;
- }
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
@@ -2144,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- if (es->s_last_orphan)
+ /* don't clear list on RO mount w/ errors */
+ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
- es->s_last_orphan = 0;
+ es->s_last_orphan = 0;
+ }
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -2528,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
static struct attribute *ext4_attrs[] = {
@@ -2543,6 +2572,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
NULL,
};
@@ -2550,10 +2580,12 @@ static struct attribute *ext4_attrs[] = {
/* Features this copy of ext4 supports */
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
NULL,
};
@@ -3374,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
- if (!IS_EXT3_SB(sb) &&
+ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
@@ -3743,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
+ sbi->s_extent_max_zeroout_kb = 32;
/*
* set up enough so that it can read an inode
@@ -4519,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
- lock_super(sb);
/* Reset the needs_recovery flag before the fs is unlocked. */
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
- unlock_super(sb);
return 0;
}
@@ -4559,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
char *orig_data = kstrdup(data, GFP_KERNEL);
/* Store the original options */
- lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
old_opts.s_mount_opt2 = sbi->s_mount_opt2;
@@ -4701,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal == NULL)
ext4_commit_super(sb, 1);
- unlock_super(sb);
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
@@ -4714,10 +4743,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_QUOTA)) {
err = ext4_enable_quotas(sb);
- if (err) {
- lock_super(sb);
+ if (err)
goto restore_opts;
- }
}
}
#endif
@@ -4744,7 +4771,6 @@ restore_opts:
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
- unlock_super(sb);
kfree(orig_data);
return err;
}
@@ -5269,8 +5295,10 @@ static int __init ext4_init_fs(void)
if (err)
goto out6;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset)
+ if (!ext4_kset) {
+ err = -ENOMEM;
goto out5;
+ }
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = ext4_init_feat_adverts();
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index e06190322c1..964b634f666 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -6,6 +6,6 @@ obj-$(CONFIG_FAT_FS) += fat.o
obj-$(CONFIG_VFAT_FS) += vfat.o
obj-$(CONFIG_MSDOS_FS) += msdos.o
-fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o
vfat-y := namei_vfat.o
msdos-y := namei_msdos.o
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 1cc7038e273..91ad9e1c944 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -190,7 +190,8 @@ static void __fat_cache_inval_inode(struct inode *inode)
struct fat_cache *cache;
while (!list_empty(&i->cache_lru)) {
- cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list);
+ cache = list_entry(i->cache_lru.next,
+ struct fat_cache, cache_list);
list_del_init(&cache->cache_list);
i->nr_caches--;
fat_cache_free(cache);
@@ -261,9 +262,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
if (nr < 0)
goto out;
else if (nr == FAT_ENT_FREE) {
- fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
- " (i_pos %lld)", __func__,
- MSDOS_I(inode)->i_pos);
+ fat_fs_error_ratelimit(sb,
+ "%s: invalid cluster chain (i_pos %lld)",
+ __func__,
+ MSDOS_I(inode)->i_pos);
nr = -EIO;
goto out;
} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index dc49ed2cbff..bca6d0a1255 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -18,7 +18,7 @@
#include <linux/time.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/kernel.h>
#include "fat.h"
@@ -123,7 +123,8 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
{
/* Fast stuff first */
if (*bh && *de &&
- (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
+ (*de - (struct msdos_dir_entry *)(*bh)->b_data) <
+ MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
*pos += sizeof(struct msdos_dir_entry);
(*de)++;
return 0;
@@ -155,7 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
ec = *ip++;
- if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+ charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE);
+ if (charlen > 0) {
op += charlen;
len -= charlen;
} else {
@@ -172,12 +174,12 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
}
if (unlikely(*ip)) {
- fat_msg(sb, KERN_WARNING, "filename was truncated while "
- "converting.");
+ fat_msg(sb, KERN_WARNING,
+ "filename was truncated while converting.");
}
*op = 0;
- return (op - ascii);
+ return op - ascii;
}
static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
@@ -205,7 +207,8 @@ fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
}
static inline int
-fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
+fat_short2lower_uni(struct nls_table *t, unsigned char *c,
+ int clen, wchar_t *uni)
{
int charlen;
wchar_t wc;
@@ -220,7 +223,8 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *un
if (!nc)
nc = *c;
- if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) {
+ charlen = t->char2uni(&nc, 1, uni);
+ if (charlen < 0) {
*uni = 0x003f; /* a question mark */
charlen = 1;
}
@@ -537,7 +541,6 @@ end_of_dir:
return err;
}
-
EXPORT_SYMBOL_GPL(fat_search_long);
struct fat_ioctl_filldir_callback {
@@ -574,7 +577,8 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
/* Fake . and .. for the root directory. */
if (inode->i_ino == MSDOS_ROOT_INO) {
while (cpos < 2) {
- if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0)
+ if (filldir(dirent, "..", cpos+1, cpos,
+ MSDOS_ROOT_INO, DT_DIR) < 0)
goto out;
cpos++;
filp->f_pos++;
@@ -872,25 +876,26 @@ static int fat_get_short_entry(struct inode *dir, loff_t *pos,
}
/*
- * The ".." entry can not provide the "struct fat_slot_info" informations
- * for inode. So, this function provide the some informations only.
+ * The ".." entry can not provide the "struct fat_slot_info" information
+ * for inode, nor a usable i_pos. So, this function provides some information
+ * only.
+ *
+ * Since this function walks through the on-disk inodes within a directory,
+ * callers are responsible for taking any locks necessary to prevent the
+ * directory from changing.
*/
int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
- struct msdos_dir_entry **de, loff_t *i_pos)
+ struct msdos_dir_entry **de)
{
- loff_t offset;
+ loff_t offset = 0;
- offset = 0;
- *bh = NULL;
+ *de = NULL;
while (fat_get_short_entry(dir, &offset, bh, de) >= 0) {
- if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) {
- *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de);
+ if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME))
return 0;
- }
}
return -ENOENT;
}
-
EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);
/* See if directory is empty */
@@ -913,7 +918,6 @@ int fat_dir_empty(struct inode *dir)
brelse(bh);
return result;
}
-
EXPORT_SYMBOL_GPL(fat_dir_empty);
/*
@@ -959,7 +963,6 @@ int fat_scan(struct inode *dir, const unsigned char *name,
}
return -ENOENT;
}
-
EXPORT_SYMBOL_GPL(fat_scan);
static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
@@ -1047,7 +1050,6 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
return 0;
}
-
EXPORT_SYMBOL_GPL(fat_remove_entries);
static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
@@ -1141,10 +1143,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
de[0].ctime_cs = de[1].ctime_cs = 0;
de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
}
- de[0].start = cpu_to_le16(cluster);
- de[0].starthi = cpu_to_le16(cluster >> 16);
- de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart);
- de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16);
+ fat_set_start(&de[0], cluster);
+ fat_set_start(&de[1], MSDOS_I(dir)->i_logstart);
de[0].size = de[1].size = 0;
memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
set_buffer_uptodate(bhs[0]);
@@ -1161,7 +1161,6 @@ error_free:
error:
return err;
}
-
EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
@@ -1377,5 +1376,4 @@ error_remove:
__fat_remove_entries(dir, pos, free_slots);
return err;
}
-
EXPORT_SYMBOL_GPL(fat_add_entries);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7d8e0dcac5d..ca7e8f8bad7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -5,6 +5,7 @@
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/fs.h>
+#include <linux/hash.h>
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>
@@ -27,26 +28,27 @@ struct fat_mount_options {
kgid_t fs_gid;
unsigned short fs_fmask;
unsigned short fs_dmask;
- unsigned short codepage; /* Codepage for shortname conversions */
- char *iocharset; /* Charset used for filename input/display */
- unsigned short shortname; /* flags for shortname display/create rule */
- unsigned char name_check; /* r = relaxed, n = normal, s = strict */
- unsigned char errors; /* On error: continue, panic, remount-ro */
+ unsigned short codepage; /* Codepage for shortname conversions */
+ char *iocharset; /* Charset used for filename input/display */
+ unsigned short shortname; /* flags for shortname display/create rule */
+ unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+ unsigned char errors; /* On error: continue, panic, remount-ro */
unsigned short allow_utime;/* permission for setting the [am]time */
- unsigned quiet:1, /* set = fake successful chmods and chowns */
- showexec:1, /* set = only set x bit for com/exe/bat */
- sys_immutable:1, /* set = system files are immutable */
- dotsOK:1, /* set = hidden and system files are named '.filename' */
- isvfat:1, /* 0=no vfat long filename support, 1=vfat support */
- utf8:1, /* Use of UTF-8 character set (Default) */
- unicode_xlate:1, /* create escape sequences for unhandled Unicode */
- numtail:1, /* Does first alias have a numeric '~1' type tail? */
- flush:1, /* write things quickly */
- nocase:1, /* Does this need case conversion? 0=need case conversion*/
- usefree:1, /* Use free_clusters for FAT32 */
- tz_utc:1, /* Filesystem timestamps are in UTC */
- rodir:1, /* allow ATTR_RO for directory */
- discard:1; /* Issue discard requests on deletions */
+ unsigned quiet:1, /* set = fake successful chmods and chowns */
+ showexec:1, /* set = only set x bit for com/exe/bat */
+ sys_immutable:1, /* set = system files are immutable */
+ dotsOK:1, /* set = hidden and system files are named '.filename' */
+ isvfat:1, /* 0=no vfat long filename support, 1=vfat support */
+ utf8:1, /* Use of UTF-8 character set (Default) */
+ unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+ numtail:1, /* Does first alias have a numeric '~1' type tail? */
+ flush:1, /* write things quickly */
+ nocase:1, /* Does this need case conversion? 0=need case conversion*/
+ usefree:1, /* Use free_clusters for FAT32 */
+ tz_utc:1, /* Filesystem timestamps are in UTC */
+ rodir:1, /* allow ATTR_RO for directory */
+ discard:1, /* Issue discard requests on deletions */
+ nfs:1; /* Do extra work needed for NFS export */
};
#define FAT_HASH_BITS 8
@@ -56,28 +58,28 @@ struct fat_mount_options {
* MS-DOS file system in-core superblock data
*/
struct msdos_sb_info {
- unsigned short sec_per_clus; /* sectors/cluster */
- unsigned short cluster_bits; /* log2(cluster_size) */
- unsigned int cluster_size; /* cluster size */
- unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+ unsigned short sec_per_clus; /* sectors/cluster */
+ unsigned short cluster_bits; /* log2(cluster_size) */
+ unsigned int cluster_size; /* cluster size */
+ unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */
unsigned short fat_start;
- unsigned long fat_length; /* FAT start & length (sec.) */
+ unsigned long fat_length; /* FAT start & length (sec.) */
unsigned long dir_start;
- unsigned short dir_entries; /* root dir start & entries */
- unsigned long data_start; /* first data sector */
- unsigned long max_cluster; /* maximum cluster number */
- unsigned long root_cluster; /* first cluster of the root directory */
- unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+ unsigned short dir_entries; /* root dir start & entries */
+ unsigned long data_start; /* first data sector */
+ unsigned long max_cluster; /* maximum cluster number */
+ unsigned long root_cluster; /* first cluster of the root directory */
+ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
struct mutex fat_lock;
- unsigned int prev_free; /* previously allocated cluster number */
- unsigned int free_clusters; /* -1 if undefined */
+ unsigned int prev_free; /* previously allocated cluster number */
+ unsigned int free_clusters; /* -1 if undefined */
unsigned int free_clus_valid; /* is free_clusters valid? */
struct fat_mount_options options;
- struct nls_table *nls_disk; /* Codepage used on disk */
- struct nls_table *nls_io; /* Charset used for input and display */
- const void *dir_ops; /* Opaque; default directory operations */
- int dir_per_block; /* dir entries per block */
- int dir_per_block_bits; /* log2(dir_per_block) */
+ struct nls_table *nls_disk; /* Codepage used on disk */
+ struct nls_table *nls_io; /* Charset used for input and display */
+ const void *dir_ops; /* Opaque; default directory operations */
+ int dir_per_block; /* dir entries per block */
+ int dir_per_block_bits; /* log2(dir_per_block) */
int fatent_shift;
struct fatent_operations *fatent_ops;
@@ -88,6 +90,9 @@ struct msdos_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+
+ spinlock_t dir_hash_lock;
+ struct hlist_head dir_hashtable[FAT_HASH_SIZE];
};
#define FAT_CACHE_VALID 0 /* special case for valid cache */
@@ -110,6 +115,7 @@ struct msdos_inode_info {
int i_attrs; /* unused attribute bits */
loff_t i_pos; /* on-disk position of directory entry or 0 */
struct hlist_node i_fat_hash; /* hash by i_location */
+ struct hlist_node i_dir_hash; /* hash by i_logstart */
struct rw_semaphore truncate_lock; /* protect bmap against truncate */
struct inode vfs_inode;
};
@@ -262,7 +268,7 @@ extern int fat_subdirs(struct inode *dir);
extern int fat_scan(struct inode *dir, const unsigned char *name,
struct fat_slot_info *sinfo);
extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
- struct msdos_dir_entry **de, loff_t *i_pos);
+ struct msdos_dir_entry **de);
extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct fat_slot_info *sinfo);
@@ -322,7 +328,7 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
extern const struct file_operations fat_file_operations;
extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -340,7 +346,12 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
int isvfat, void (*setup)(struct super_block *));
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
- struct inode *i2);
+ struct inode *i2);
+static inline unsigned long fat_dir_hash(int logstart)
+{
+ return hash_32(logstart, FAT_HASH_BITS);
+}
+
/* fat/misc.c */
extern __printf(3, 4) __cold
void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
@@ -366,6 +377,14 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
int fat_cache_init(void);
void fat_cache_destroy(void);
+/* fat/nfs.c */
+struct fid;
+extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type);
+extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type);
+extern struct dentry *fat_get_parent(struct dentry *child_dir);
+
/* helper for printk */
typedef unsigned long long llu;
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 31f08ab62c5..260705c5806 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -186,9 +186,6 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
static void fat32_ent_put(struct fat_entry *fatent, int new)
{
- if (new == FAT_ENT_EOF)
- new = EOF_FAT32;
-
WARN_ON(new & 0xf0000000);
new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
*fatent->u.ent32_p = cpu_to_le32(new);
@@ -203,15 +200,18 @@ static int fat12_ent_next(struct fat_entry *fatent)
fatent->entry++;
if (fatent->nr_bhs == 1) {
- WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2)));
- WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1)));
+ WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data +
+ (bhs[0]->b_size - 2)));
+ WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data +
+ (bhs[0]->b_size - 1)));
if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) {
ent12_p[0] = nextp - 1;
ent12_p[1] = nextp;
return 1;
}
} else {
- WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1)));
+ WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data +
+ (bhs[0]->b_size - 1)));
WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data);
ent12_p[0] = nextp - 1;
ent12_p[1] = nextp;
@@ -631,7 +631,6 @@ error:
return err;
}
-
EXPORT_SYMBOL_GPL(fat_free_clusters);
/* 128kb is the whole sectors for FAT12 and FAT16 */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e5a6ac54eb..76f60c642c0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -281,15 +281,42 @@ static inline unsigned long fat_hash(loff_t i_pos)
return hash_32(i_pos, FAT_HASH_BITS);
}
+static void dir_hash_init(struct super_block *sb)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int i;
+
+ spin_lock_init(&sbi->dir_hash_lock);
+ for (i = 0; i < FAT_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&sbi->dir_hashtable[i]);
+}
+
void fat_attach(struct inode *inode, loff_t i_pos)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
- spin_lock(&sbi->inode_hash_lock);
- MSDOS_I(inode)->i_pos = i_pos;
- hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
- spin_unlock(&sbi->inode_hash_lock);
+ if (inode->i_ino != MSDOS_ROOT_INO) {
+ struct hlist_head *head = sbi->inode_hashtable
+ + fat_hash(i_pos);
+
+ spin_lock(&sbi->inode_hash_lock);
+ MSDOS_I(inode)->i_pos = i_pos;
+ hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
+ spin_unlock(&sbi->inode_hash_lock);
+ }
+
+ /* If NFS support is enabled, cache the mapping of start cluster
+ * to directory inode. This is used during reconnection of
+ * dentries to the filesystem root.
+ */
+ if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+ struct hlist_head *d_head = sbi->dir_hashtable;
+ d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart);
+
+ spin_lock(&sbi->dir_hash_lock);
+ hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head);
+ spin_unlock(&sbi->dir_hash_lock);
+ }
}
EXPORT_SYMBOL_GPL(fat_attach);
@@ -300,6 +327,12 @@ void fat_detach(struct inode *inode)
MSDOS_I(inode)->i_pos = 0;
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
+
+ if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+ spin_lock(&sbi->dir_hash_lock);
+ hlist_del_init(&MSDOS_I(inode)->i_dir_hash);
+ spin_unlock(&sbi->dir_hash_lock);
+ }
}
EXPORT_SYMBOL_GPL(fat_detach);
@@ -504,6 +537,7 @@ static void init_once(void *foo)
ei->cache_valid_id = FAT_CACHE_VALID + 1;
INIT_LIST_HEAD(&ei->cache_lru);
INIT_HLIST_NODE(&ei->i_fat_hash);
+ INIT_HLIST_NODE(&ei->i_dir_hash);
inode_init_once(&ei->vfs_inode);
}
@@ -668,125 +702,9 @@ static const struct super_operations fat_sops = {
.show_options = fat_show_options,
};
-/*
- * a FAT file handle with fhtype 3 is
- * 0/ i_ino - for fast, reliable lookup if still in the cache
- * 1/ i_generation - to see if i_ino is still valid
- * bit 0 == 0 iff directory
- * 2/ i_pos(8-39) - if ino has changed, but still in cache
- * 3/ i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos
- * 4/ i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc
- *
- * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum
- * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits
- * of i_logstart is used to store the directory entry offset.
- */
-
-static struct dentry *fat_fh_to_dentry(struct super_block *sb,
- struct fid *fid, int fh_len, int fh_type)
-{
- struct inode *inode = NULL;
- u32 *fh = fid->raw;
-
- if (fh_len < 5 || fh_type != 3)
- return NULL;
-
- inode = ilookup(sb, fh[0]);
- if (!inode || inode->i_generation != fh[1]) {
- if (inode)
- iput(inode);
- inode = NULL;
- }
- if (!inode) {
- loff_t i_pos;
- int i_logstart = fh[3] & 0x0fffffff;
-
- i_pos = (loff_t)fh[2] << 8;
- i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28);
-
- /* try 2 - see if i_pos is in F-d-c
- * require i_logstart to be the same
- * Will fail if you truncate and then re-write
- */
-
- inode = fat_iget(sb, i_pos);
- if (inode && MSDOS_I(inode)->i_logstart != i_logstart) {
- iput(inode);
- inode = NULL;
- }
- }
-
- /*
- * For now, do nothing if the inode is not found.
- *
- * What we could do is:
- *
- * - follow the file starting at fh[4], and record the ".." entry,
- * and the name of the fh[2] entry.
- * - then follow the ".." file finding the next step up.
- *
- * This way we build a path to the root of the tree. If this works, we
- * lookup the path and so get this inode into the cache. Finally try
- * the fat_iget lookup again. If that fails, then we are totally out
- * of luck. But all that is for another day
- */
- return d_obtain_alias(inode);
-}
-
-static int
-fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
-{
- int len = *lenp;
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- loff_t i_pos;
-
- if (len < 5) {
- *lenp = 5;
- return 255; /* no room */
- }
-
- i_pos = fat_i_pos_read(sbi, inode);
- *lenp = 5;
- fh[0] = inode->i_ino;
- fh[1] = inode->i_generation;
- fh[2] = i_pos >> 8;
- fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart;
- fh[4] = (i_pos & 0x0f) << 28;
- if (parent)
- fh[4] |= MSDOS_I(parent)->i_logstart;
- return 3;
-}
-
-static struct dentry *fat_get_parent(struct dentry *child)
-{
- struct super_block *sb = child->d_sb;
- struct buffer_head *bh;
- struct msdos_dir_entry *de;
- loff_t i_pos;
- struct dentry *parent;
- struct inode *inode;
- int err;
-
- lock_super(sb);
-
- err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
- if (err) {
- parent = ERR_PTR(err);
- goto out;
- }
- inode = fat_build_inode(sb, de, i_pos);
- brelse(bh);
-
- parent = d_obtain_alias(inode);
-out:
- unlock_super(sb);
-
- return parent;
-}
-
static const struct export_operations fat_export_ops = {
- .encode_fh = fat_encode_fh,
.fh_to_dentry = fat_fh_to_dentry,
+ .fh_to_parent = fat_fh_to_parent,
.get_parent = fat_get_parent,
};
@@ -836,6 +754,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",usefree");
if (opts->quiet)
seq_puts(m, ",quiet");
+ if (opts->nfs)
+ seq_puts(m, ",nfs");
if (opts->showexec)
seq_puts(m, ",showexec");
if (opts->sys_immutable)
@@ -880,7 +800,7 @@ enum {
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
- Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
+ Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err,
};
static const match_table_t fat_tokens = {
@@ -909,6 +829,7 @@ static const match_table_t fat_tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_discard, "discard"},
+ {Opt_nfs, "nfs"},
{Opt_obsolete, "conv=binary"},
{Opt_obsolete, "conv=text"},
{Opt_obsolete, "conv=auto"},
@@ -989,6 +910,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
opts->numtail = 1;
opts->usefree = opts->nocase = 0;
opts->tz_utc = 0;
+ opts->nfs = 0;
opts->errors = FAT_ERRORS_RO;
*debug = 0;
@@ -1153,6 +1075,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_discard:
opts->discard = 1;
break;
+ case Opt_nfs:
+ opts->nfs = 1;
+ break;
/* obsolete mount options */
case Opt_obsolete:
@@ -1443,6 +1368,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* set up enough so that it can read an inode */
fat_hash_init(sb);
+ dir_hash_init(sb);
fat_ent_access_init(sb);
/*
@@ -1497,6 +1423,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
}
error = -ENOMEM;
insert_inode_hash(root_inode);
+ fat_attach(root_inode, 0);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
fat_msg(sb, KERN_ERR, "get root inode failed");
@@ -1536,18 +1463,14 @@ static int writeback_inode(struct inode *inode)
{
int ret;
- struct address_space *mapping = inode->i_mapping;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 0,
- };
- /* if we used WB_SYNC_ALL, sync_inode waits for the io for the
- * inode to finish. So WB_SYNC_NONE is sent down to sync_inode
+
+ /* if we used wait=1, sync_inode_metadata waits for the io for the
+ * inode to finish. So wait=0 is sent down to sync_inode_metadata
* and filemap_fdatawrite is used for the data blocks
*/
- ret = sync_inode(inode, &wbc);
+ ret = sync_inode_metadata(inode, 0);
if (!ret)
- ret = filemap_fdatawrite(mapping);
+ ret = filemap_fdatawrite(inode->i_mapping);
return ret;
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index b0e12bf9f4a..c1055e778ff 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -407,7 +407,7 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- struct super_block *sb= inode->i_sb;
+ struct super_block *sb = inode->i_sb;
struct fat_slot_info sinfo;
int err;
@@ -440,7 +440,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec ts;
- loff_t dotdot_i_pos, new_i_pos;
+ loff_t new_i_pos;
int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -456,8 +456,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
is_dir = S_ISDIR(old_inode->i_mode);
update_dotdot = (is_dir && old_dir != new_dir);
if (update_dotdot) {
- if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
- &dotdot_i_pos) < 0) {
+ if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
err = -EIO;
goto out;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6a6d8c0715a..e535dd75b98 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -914,7 +914,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec ts;
- loff_t dotdot_i_pos, new_i_pos;
+ loff_t new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
@@ -929,8 +929,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
is_dir = S_ISDIR(old_inode->i_mode);
update_dotdot = (is_dir && old_dir != new_dir);
if (update_dotdot) {
- if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
- &dotdot_i_pos) < 0) {
+ if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
err = -EIO;
goto out;
}
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
new file mode 100644
index 00000000000..ef4b5faba87
--- /dev/null
+++ b/fs/fat/nfs.c
@@ -0,0 +1,101 @@
+/* fs/fat/nfs.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/exportfs.h>
+#include "fat.h"
+
+/**
+ * Look up a directory inode given its starting cluster.
+ */
+static struct inode *fat_dget(struct super_block *sb, int i_logstart)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ struct hlist_head *head;
+ struct hlist_node *_p;
+ struct msdos_inode_info *i;
+ struct inode *inode = NULL;
+
+ head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
+ spin_lock(&sbi->dir_hash_lock);
+ hlist_for_each_entry(i, _p, head, i_dir_hash) {
+ BUG_ON(i->vfs_inode.i_sb != sb);
+ if (i->i_logstart != i_logstart)
+ continue;
+ inode = igrab(&i->vfs_inode);
+ if (inode)
+ break;
+ }
+ spin_unlock(&sbi->dir_hash_lock);
+ return inode;
+}
+
+static struct inode *fat_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ struct inode *inode;
+
+ if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
+ return NULL;
+
+ inode = ilookup(sb, ino);
+ if (inode && generation && (inode->i_generation != generation)) {
+ iput(inode);
+ inode = NULL;
+ }
+
+ return inode;
+}
+
+/**
+ * Map a NFS file handle to a corresponding dentry.
+ * The dentry may or may not be connected to the filesystem root.
+ */
+struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ fat_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a file specified by NFS handle.
+ * This requires that the handle contain the i_ino of the parent.
+ */
+struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ fat_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a directory that is not currently connected to
+ * the filesystem root.
+ *
+ * On entry, the caller holds child_dir->d_inode->i_mutex.
+ */
+struct dentry *fat_get_parent(struct dentry *child_dir)
+{
+ struct super_block *sb = child_dir->d_sb;
+ struct buffer_head *bh = NULL;
+ struct msdos_dir_entry *de;
+ struct inode *parent_inode = NULL;
+
+ if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
+ int parent_logstart = fat_get_start(MSDOS_SB(sb), de);
+ parent_inode = fat_dget(sb, parent_logstart);
+ }
+ brelse(bh);
+
+ return d_obtain_alias(parent_inode);
+}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8f704291d4e..71a600a19f0 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -258,7 +258,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = f_dupfd(arg, filp, 0);
break;
case F_DUPFD_CLOEXEC:
- err = f_dupfd(arg, filp, FD_CLOEXEC);
+ err = f_dupfd(arg, filp, O_CLOEXEC);
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6d46c0d7833..401b6c6248a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
{
return test_bit(BDI_writeback_running, &bdi->state);
}
+EXPORT_SYMBOL(writeback_in_progress);
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
@@ -438,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* setting I_SYNC flag and calling inode_sync_complete() to clear it.
*/
static int
-__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
long nr_to_write = wbc->nr_to_write;
@@ -526,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
inode->i_state |= I_SYNC;
spin_unlock(&inode->i_lock);
- ret = __writeback_single_inode(inode, wb, wbc);
+ ret = __writeback_single_inode(inode, wbc);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
@@ -669,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb,
* We use I_SYNC to pin the inode in memory. While it is set
* evict_inode() will wait so the inode cannot be freed.
*/
- __writeback_single_inode(inode, wb, &wbc);
+ __writeback_single_inode(inode, &wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index aba15f1b7ad..78d2837bc94 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
.close = fuse_vma_close,
.fault = filemap_fault,
.page_mkwrite = fuse_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 30e21997a1a..0def0504afc 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -492,6 +492,7 @@ out:
static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = gfs2_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
/**
@@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
return error;
}
vma->vm_ops = &gfs2_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 4bae4a4a60b..2d5b254ad9e 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -102,7 +102,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
return -1;
}
if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) {
- btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1);
+ le32_add_cpu(&btree->u.external[n].length, 1);
mark_buffer_dirty(bh);
brelse(bh);
return se;
@@ -153,7 +153,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
btree = &anode->btree;
}
btree->n_free_nodes--; n = btree->n_used_nodes++;
- btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12);
+ le16_add_cpu(&btree->first_free, 12);
btree->u.external[n].disk_secno = cpu_to_le32(se);
btree->u.external[n].file_secno = cpu_to_le32(fs);
btree->u.external[n].length = cpu_to_le32(1);
@@ -174,7 +174,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
}
if (btree->n_free_nodes) {
btree->n_free_nodes--; n = btree->n_used_nodes++;
- btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8);
+ le16_add_cpu(&btree->first_free, 8);
btree->u.internal[n].file_secno = cpu_to_le32(-1);
btree->u.internal[n].down = cpu_to_le32(na);
btree->u.internal[n-1].file_secno = cpu_to_le32(fs);
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 3228c524ebe..4364b2a02c5 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -145,10 +145,10 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
}
}
if (ptr) {
- d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4);
+ le32_add_cpu(&d->first_free, 4);
if (le32_to_cpu(d->first_free) > 2048) {
hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self));
- d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4);
+ le32_add_cpu(&d->first_free, -4);
return;
}
de->length = cpu_to_le16(36);
@@ -184,7 +184,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
de->not_8x3 = hpfs_is_name_long(name, namelen);
de->namelen = namelen;
memcpy(de->name, name, namelen);
- d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size);
+ le32_add_cpu(&d->first_free, d_size);
return de;
}
@@ -314,7 +314,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
de = de_next_de(de);
memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de);
- nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20));
+ le32_add_cpu(&nd->first_free, -((char *)de - (char *)nd - 20));
memcpy(d, nd, le32_to_cpu(nd->first_free));
for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos);
fix_up_ptrs(i->i_sb, ad);
@@ -474,8 +474,8 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to)
hpfs_brelse4(&qbh);
return 0;
}
- dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4);
- de->length = cpu_to_le16(le16_to_cpu(de->length) - 4);
+ le32_add_cpu(&dnode->first_free, -4);
+ le16_add_cpu(&de->length, -4);
de->down = 0;
hpfs_mark_4buffers_dirty(&qbh);
dno = up;
@@ -570,8 +570,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p);
if (!down) {
de->down = 0;
- de->length = cpu_to_le16(le16_to_cpu(de->length) - 4);
- dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4);
+ le16_add_cpu(&de->length, -4);
+ le32_add_cpu(&dnode->first_free, -4);
memmove(de_next_de(de), (char *)de_next_de(de) + 4,
(char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de));
} else {
@@ -647,14 +647,14 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n");
printk("HPFS: warning: goin'on\n");
}
- del->length = cpu_to_le16(le16_to_cpu(del->length) + 4);
+ le16_add_cpu(&del->length, 4);
del->down = 1;
- d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4);
+ le32_add_cpu(&d1->first_free, 4);
}
if (dlp && !down) {
- del->length = cpu_to_le16(le16_to_cpu(del->length) - 4);
+ le16_add_cpu(&del->length, -4);
del->down = 0;
- d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4);
+ le32_add_cpu(&d1->first_free, -4);
} else if (down)
*(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
} else goto endm;
@@ -668,9 +668,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length));
hpfs_delete_de(i->i_sb, dnode, de_prev);
if (!de_prev->down) {
- de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4);
+ le16_add_cpu(&de_prev->length, 4);
de_prev->down = 1;
- dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4);
+ le32_add_cpu(&dnode->first_free, 4);
}
*(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
hpfs_mark_4buffers_dirty(&qbh);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9460120a517..c5bc355d824 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* way when do_mmap_pgoff unwinds (may be important on powerpc
* and ia64).
*/
- vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
+ vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &hugetlb_vm_ops;
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
@@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
+hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
- vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) {
+ vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
unsigned long v_offset;
/*
* Can the expression below overflow on 32-bit arches?
- * No, because the prio_tree returns us only those vmas
+ * No, because the interval tree returns us only those vmas
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
@@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
mutex_lock(&mapping->i_mmap_mutex);
- if (!prio_tree_empty(&mapping->i_mmap))
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
mutex_unlock(&mapping->i_mmap_mutex);
truncate_hugepages(inode, offset);
diff --git a/fs/inode.c b/fs/inode.c
index ac8d904b3f1..b03c7195724 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping)
mutex_init(&mapping->i_mmap_mutex);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
- INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+ mapping->i_mmap = RB_ROOT;
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
}
EXPORT_SYMBOL(address_space_init_once);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 52c15c77602..86b39b167c2 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -86,7 +86,12 @@ nope:
static void release_data_buffer(struct buffer_head *bh)
{
if (buffer_freed(bh)) {
+ WARN_ON_ONCE(buffer_dirty(bh));
clear_buffer_freed(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
release_buffer_page(bh);
} else
put_bh(bh);
@@ -866,17 +871,35 @@ restart_loop:
* there's no point in keeping a checkpoint record for
* it. */
- /* A buffer which has been freed while still being
- * journaled by a previous transaction may end up still
- * being dirty here, but we want to avoid writing back
- * that buffer in the future after the "add to orphan"
- * operation been committed, That's not only a performance
- * gain, it also stops aliasing problems if the buffer is
- * left behind for writeback and gets reallocated for another
- * use in a different page. */
- if (buffer_freed(bh) && !jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ /*
+ * A buffer which has been freed while still being journaled by
+ * a previous transaction.
+ */
+ if (buffer_freed(bh)) {
+ /*
+ * If the running transaction is the one containing
+ * "add to orphan" operation (b_next_transaction !=
+ * NULL), we have to wait for that transaction to
+ * commit before we can really get rid of the buffer.
+ * So just clear b_modified to not confuse transaction
+ * credit accounting and refile the buffer to
+ * BJ_Forget of the running transaction. If the just
+ * committed transaction contains "add to orphan"
+ * operation, we can completely invalidate the buffer
+ * now. We are rather throughout in that since the
+ * buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial
+ * page.
+ */
+ jh->b_modified = 0;
+ if (!jh->b_next_transaction) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
+ }
}
if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index febc10db5ce..78b7f84241d 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1843,15 +1843,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+ int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
- int ret;
BUFFER_TRACE(bh, "entry");
+retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1879,10 +1880,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* clear the buffer dirty bit at latest at the moment when the
* transaction marking the buffer as freed in the filesystem
* structures is committed because from that moment on the
- * buffer can be reallocated and used by a different page.
+ * block can be reallocated and used by a different page.
* Since the block hasn't been freed yet but the inode has
* already been added to orphan list, it is safe for us to add
* the buffer to BJ_Forget list of the newest transaction.
+ *
+ * Also we have to clear buffer_mapped flag of a truncated buffer
+ * because the buffer_head may be attached to the page straddling
+ * i_size (can happen only when blocksize < pagesize) and thus the
+ * buffer_head can be reused when the file is extended again. So we end
+ * up keeping around invalidated buffers attached to transactions'
+ * BJ_Forget list just to stop checkpointing code from cleaning up
+ * the transaction this buffer was modified in.
*/
transaction = jh->b_transaction;
if (transaction == NULL) {
@@ -1909,13 +1918,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_running_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
@@ -1923,13 +1928,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
@@ -1950,10 +1951,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
/*
* The buffer is committing, we simply cannot touch
- * it. So we just set j_next_transaction to the
- * running transaction (if there is one) and mark
- * buffer as freed so that commit code knows it should
- * clear dirty bits when it is done with the buffer.
+ * it. If the page is straddling i_size we have to wait
+ * for commit and try again.
+ */
+ if (partial_page) {
+ tid_t tid = journal->j_committing_transaction->t_tid;
+
+ journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+ goto retry;
+ }
+ /*
+ * OK, buffer won't be reachable after truncate. We just set
+ * j_next_transaction to the running transaction (if there is
+ * one) and mark buffer as freed so that commit code knows it
+ * should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1976,6 +1991,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
zap_buffer:
+ /*
+ * This is tricky. Although the buffer is truncated, it may be reused
+ * if blocksize < pagesize and it is attached to the page straddling
+ * EOF. Since the buffer might have been added to BJ_Forget list of the
+ * running transaction, journal_get_write_access() won't clear
+ * b_modified and credit accounting gets confused. So clear b_modified
+ * here. */
+ jh->b_modified = 0;
journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
@@ -2024,7 +2047,8 @@ void journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh);
+ may_free &= journal_unmap_buffer(journal, bh,
+ offset > 0);
unlock_buffer(bh);
}
curr_off = next_off;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index af5280fb579..3091d42992f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1014,17 +1014,35 @@ restart_loop:
* there's no point in keeping a checkpoint record for
* it. */
- /* A buffer which has been freed while still being
- * journaled by a previous transaction may end up still
- * being dirty here, but we want to avoid writing back
- * that buffer in the future after the "add to orphan"
- * operation been committed, That's not only a performance
- * gain, it also stops aliasing problems if the buffer is
- * left behind for writeback and gets reallocated for another
- * use in a different page. */
- if (buffer_freed(bh) && !jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ /*
+ * A buffer which has been freed while still being journaled by
+ * a previous transaction.
+ */
+ if (buffer_freed(bh)) {
+ /*
+ * If the running transaction is the one containing
+ * "add to orphan" operation (b_next_transaction !=
+ * NULL), we have to wait for that transaction to
+ * commit before we can really get rid of the buffer.
+ * So just clear b_modified to not confuse transaction
+ * credit accounting and refile the buffer to
+ * BJ_Forget of the running transaction. If the just
+ * committed transaction contains "add to orphan"
+ * operation, we can completely invalidate the buffer
+ * now. We are rather through in that since the
+ * buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial
+ * page.
+ */
+ jh->b_modified = 0;
+ if (!jh->b_next_transaction) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
+ }
}
if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e149b99a7ff..484b8d1c6cb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal)
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
read_lock(&journal->j_state_lock);
+ /* Is it already empty? */
+ if (sb->s_start == 0) {
+ read_unlock(&journal->j_state_lock);
+ return;
+ }
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 0131e436253..626846bac32 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = err2;
/* Make sure all replayed data is on permanent storage */
- if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ if (journal->j_flags & JBD2_BARRIER) {
+ err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ if (!err)
+ err = err2;
+ }
return err;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index fb1ab9533b6..a74ba465954 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+ int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
- int ret;
BUFFER_TRACE(bh, "entry");
+retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* clear the buffer dirty bit at latest at the moment when the
* transaction marking the buffer as freed in the filesystem
* structures is committed because from that moment on the
- * buffer can be reallocated and used by a different page.
+ * block can be reallocated and used by a different page.
* Since the block hasn't been freed yet but the inode has
* already been added to orphan list, it is safe for us to add
* the buffer to BJ_Forget list of the newest transaction.
+ *
+ * Also we have to clear buffer_mapped flag of a truncated buffer
+ * because the buffer_head may be attached to the page straddling
+ * i_size (can happen only when blocksize < pagesize) and thus the
+ * buffer_head can be reused when the file is extended again. So we end
+ * up keeping around invalidated buffers attached to transactions'
+ * BJ_Forget list just to stop checkpointing code from cleaning up
+ * the transaction this buffer was modified in.
*/
transaction = jh->b_transaction;
if (transaction == NULL) {
@@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_running_transaction);
- jbd2_journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- write_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
@@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
- jbd2_journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- write_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
@@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
JBUFFER_TRACE(jh, "on committing transaction");
/*
* The buffer is committing, we simply cannot touch
- * it. So we just set j_next_transaction to the
- * running transaction (if there is one) and mark
- * buffer as freed so that commit code knows it should
- * clear dirty bits when it is done with the buffer.
+ * it. If the page is straddling i_size we have to wait
+ * for commit and try again.
+ */
+ if (partial_page) {
+ tid_t tid = journal->j_committing_transaction->t_tid;
+
+ jbd2_journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ write_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ goto retry;
+ }
+ /*
+ * OK, buffer won't be reachable after truncate. We just set
+ * j_next_transaction to the running transaction (if there is
+ * one) and mark buffer as freed so that commit code knows it
+ * should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
zap_buffer:
+ /*
+ * This is tricky. Although the buffer is truncated, it may be reused
+ * if blocksize < pagesize and it is attached to the page straddling
+ * EOF. Since the buffer might have been added to BJ_Forget list of the
+ * running transaction, journal_get_write_access() won't clear
+ * b_modified and credit accounting gets confused. So clear b_modified
+ * here.
+ */
+ jh->b_modified = 0;
jbd2_journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
@@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh);
+ may_free &= journal_unmap_buffer(journal, bh,
+ offset > 0);
unlock_buffer(bh);
}
curr_off = next_off;
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1ea349fff68..ae81b01e6fd 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
}
/* Trivial function to remove the last node in the tree. Which by definition
- has no right-hand -- so can be removed just by making its only child (if
- any) take its place under its parent. */
+ has no right-hand child — so can be removed just by making its left-hand
+ child (if any) take its place under its parent. Since this is only done
+ when we're consuming the whole tree, there's no need to use rb_erase()
+ and let it worry about adjusting colours and balancing the tree. That
+ would just be a waste of time. */
static void eat_last(struct rb_root *root, struct rb_node *node)
{
struct rb_node *parent = rb_parent(node);
@@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node)
link = &parent->rb_right;
*link = node->rb_left;
- /* Colour doesn't matter now. Only the parent pointer. */
if (node->rb_left)
- node->rb_left->rb_parent_color = node->rb_parent_color;
+ node->rb_left->__rb_parent_color = node->__rb_parent_color;
}
-/* We put this in reverse order, so we can just use eat_last */
+/* We put the version tree in reverse order, so we can use the same eat_last()
+ function that we use to consume the tmpnode tree (tn_root). */
static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
{
struct rb_node **link = &ver_root->rb_node;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6a7fcab7ecb..f692be97676 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -578,6 +578,7 @@ out:
static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nfs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index a4d56ac02e6..16f35f7423c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (unlikely(ret))
goto out;
+ file_update_time(vma->vm_file);
ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
@@ -134,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nilfs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &nilfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index d150372fd81..47a87dda54c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,6 +173,7 @@ out:
static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 2c6d95257a4..77e3cb2962b 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -146,8 +146,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
be64_to_cpu(entry->e_blocks);
if (omfs_allocate_block(inode->i_sb, new_block)) {
- entry->e_blocks =
- cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1);
+ be64_add_cpu(&entry->e_blocks, 1);
terminator->e_blocks = ~(cpu_to_be64(
be64_to_cpu(~terminator->e_blocks) + 1));
goto out;
@@ -177,7 +176,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
be64_to_cpu(~terminator->e_blocks) + (u64) new_count));
/* write in new entry */
- oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count));
+ be32_add_cpu(&oe->e_extent_count, 1);
out:
*ret_block = new_block;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d295af99367..ef5c84be66f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = {
.release = mem_release,
};
-static ssize_t oom_adjust_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char buffer[PROC_NUMBUF];
- size_t len;
- int oom_adjust = OOM_DISABLE;
- unsigned long flags;
-
- if (!task)
- return -ESRCH;
-
- if (lock_task_sighand(task, &flags)) {
- oom_adjust = task->signal->oom_adj;
- unlock_task_sighand(task, &flags);
- }
-
- put_task_struct(task);
-
- len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
-
- return simple_read_from_buffer(buf, count, ppos, buffer, len);
-}
-
-static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *task;
- char buffer[PROC_NUMBUF];
- int oom_adjust;
- unsigned long flags;
- int err;
-
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count)) {
- err = -EFAULT;
- goto out;
- }
-
- err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
- if (err)
- goto out;
- if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
- oom_adjust != OOM_DISABLE) {
- err = -EINVAL;
- goto out;
- }
-
- task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task) {
- err = -ESRCH;
- goto out;
- }
-
- task_lock(task);
- if (!task->mm) {
- err = -EINVAL;
- goto err_task_lock;
- }
-
- if (!lock_task_sighand(task, &flags)) {
- err = -ESRCH;
- goto err_task_lock;
- }
-
- if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
- err = -EACCES;
- goto err_sighand;
- }
-
- /*
- * Warn that /proc/pid/oom_adj is deprecated, see
- * Documentation/feature-removal-schedule.txt.
- */
- printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
- current->comm, task_pid_nr(current), task_pid_nr(task),
- task_pid_nr(task));
- task->signal->oom_adj = oom_adjust;
- /*
- * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
- * value is always attainable.
- */
- if (task->signal->oom_adj == OOM_ADJUST_MAX)
- task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
- else
- task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
- -OOM_DISABLE;
- trace_oom_score_adj_update(task);
-err_sighand:
- unlock_task_sighand(task, &flags);
-err_task_lock:
- task_unlock(task);
- put_task_struct(task);
-out:
- return err < 0 ? err : count;
-}
-
-static const struct file_operations proc_oom_adjust_operations = {
- .read = oom_adjust_read,
- .write = oom_adjust_write,
- .llseek = generic_file_llseek,
-};
-
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
task->signal->oom_score_adj_min = oom_score_adj;
trace_oom_score_adj_update(task);
- /*
- * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
- * always attainable.
- */
- if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- task->signal->oom_adj = OOM_DISABLE;
- else
- task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
- OOM_SCORE_ADJ_MAX;
+
err_sighand:
unlock_task_sighand(task, &flags);
err_task_lock:
@@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b3647fe6a60..0d80cef4cfb 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
pde_get(de);
spin_unlock(&proc_subdir_lock);
- error = -EINVAL;
+ error = -ENOMEM;
inode = proc_get_inode(dir->i_sb, de);
goto out_unlock;
}
@@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
unsigned int len;
/* make sure name is valid */
- if (!name || !strlen(name)) goto out;
+ if (!name || !strlen(name))
+ goto out;
if (xlate_proc_name(name, parent, &fn) != 0)
goto out;
@@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
len = strlen(fn);
- ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
- if (!ent) goto out;
+ ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+ if (!ent)
+ goto out;
- memset(ent, 0, sizeof(struct proc_dir_entry));
memcpy(ent->name, fn, len + 1);
ent->namelen = len;
ent->mode = mode;
ent->nlink = nlink;
atomic_set(&ent->count, 1);
- ent->pde_users = 0;
spin_lock_init(&ent->pde_unload_lock);
- ent->pde_unload_completion = NULL;
INIT_LIST_HEAD(&ent->pde_openers);
- out:
+out:
return ent;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ac817b64a7..3b22bbdee9e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return NULL;
if (inode->i_state & I_NEW) {
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- PROC_I(inode)->fd = 0;
PROC_I(inode)->pde = de;
if (de->mode) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 67925a7bd8c..cceaab07ad5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -103,7 +103,7 @@ static inline int task_dumpable(struct task_struct *task)
if (mm)
dumpable = get_dumpable(mm);
task_unlock(task);
- if(dumpable == 1)
+ if (dumpable == SUID_DUMPABLE_ENABLED)
return 1;
return 0;
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7fcd0d60a96..b8730d9ebae 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
- else if (PageTransCompound(page))
+ /*
+ * PageTransCompound can be true for non-huge compound pages (slab
+ * pages or pages allocated by drivers with __GFP_COMP) because it
+ * just checks PG_head/PG_tail, so we need to check PageLRU to make
+ * sure a given page is a thp, not a non-huge compound page.
+ */
+ else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
u |= 1 << KPF_THP;
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index eb7cc91b725..a781bdf0669 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
}
rb_link_node(node, parent, p);
+ rb_insert_color(node, &head->parent->root);
return 0;
}
@@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head,
head->node = node;
if (node) {
struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++) {
- rb_init_node(&node->node);
+ for (entry = table; entry->procname; entry++, node++)
node->header = head;
- }
}
}
@@ -266,8 +265,7 @@ void sysctl_head_put(struct ctl_table_header *head)
static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
{
- if (!head)
- BUG();
+ BUG_ON(!head);
spin_lock(&sysctl_lock);
if (!use_table(head))
head = ERR_PTR(-ENOENT);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9a2d9fd7cad..9889a92d2e0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
if (!*p)
continue;
- args[0].to = args[0].from = 0;
+ args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
switch (token) {
case Opt_gid:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4540b8f76f1..79827ce03e3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPTE:\t%8lu kB\n"
"VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
- (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+ total_vm << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index d39bb5cce88..ca71db69da0 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -23,6 +23,7 @@ config PSTORE_FTRACE
bool "Persistent function tracer"
depends on PSTORE
depends on FUNCTION_TRACER
+ depends on DEBUG_FS
help
With this option kernel traces function calls into a persistent
ram buffer that can be decoded and dumped after reboot through
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index a130d484b7d..2d57e1ac011 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -17,19 +17,113 @@
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/cache.h>
#include <asm/barrier.h>
#include "internal.h"
-void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip)
+static void notrace pstore_ftrace_call(unsigned long ip,
+ unsigned long parent_ip)
{
+ unsigned long flags;
struct pstore_ftrace_record rec = {};
if (unlikely(oops_in_progress))
return;
+ local_irq_save(flags);
+
rec.ip = ip;
rec.parent_ip = parent_ip;
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
sizeof(rec), psinfo);
+
+ local_irq_restore(flags);
+}
+
+static struct ftrace_ops pstore_ftrace_ops __read_mostly = {
+ .func = pstore_ftrace_call,
+};
+
+static DEFINE_MUTEX(pstore_ftrace_lock);
+static bool pstore_ftrace_enabled;
+
+static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u8 on;
+ ssize_t ret;
+
+ ret = kstrtou8_from_user(buf, count, 2, &on);
+ if (ret)
+ return ret;
+
+ mutex_lock(&pstore_ftrace_lock);
+
+ if (!on ^ pstore_ftrace_enabled)
+ goto out;
+
+ if (on)
+ ret = register_ftrace_function(&pstore_ftrace_ops);
+ else
+ ret = unregister_ftrace_function(&pstore_ftrace_ops);
+ if (ret) {
+ pr_err("%s: unable to %sregister ftrace ops: %zd\n",
+ __func__, on ? "" : "un", ret);
+ goto err;
+ }
+
+ pstore_ftrace_enabled = on;
+out:
+ ret = count;
+err:
+ mutex_unlock(&pstore_ftrace_lock);
+
+ return ret;
+}
+
+static ssize_t pstore_ftrace_knob_read(struct file *f, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char val[] = { '0' + pstore_ftrace_enabled, '\n' };
+
+ return simple_read_from_buffer(buf, count, ppos, val, sizeof(val));
+}
+
+static const struct file_operations pstore_knob_fops = {
+ .open = simple_open,
+ .read = pstore_ftrace_knob_read,
+ .write = pstore_ftrace_knob_write,
+};
+
+void pstore_register_ftrace(void)
+{
+ struct dentry *dir;
+ struct dentry *file;
+
+ if (!psinfo->write_buf)
+ return;
+
+ dir = debugfs_create_dir("pstore", NULL);
+ if (!dir) {
+ pr_err("%s: unable to create pstore directory\n", __func__);
+ return;
+ }
+
+ file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
+ &pstore_knob_fops);
+ if (!file) {
+ pr_err("%s: unable to create record_ftrace file\n", __func__);
+ goto err_file;
+ }
+
+ return;
+err_file:
+ debugfs_remove(dir);
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 0d0d3b7d5f1..4847f588b7d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -39,6 +39,12 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
#endif
}
+#ifdef CONFIG_PSTORE_FTRACE
+extern void pstore_register_ftrace(void);
+#else
+static inline void pstore_register_ftrace(void) {}
+#endif
+
extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 29996e8793a..a40da07e93d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -164,7 +164,13 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
if (c > psinfo->bufsize)
c = psinfo->bufsize;
- spin_lock_irqsave(&psinfo->buf_lock, flags);
+
+ if (oops_in_progress) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
+ break;
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
+ }
memcpy(psinfo->buf, s, c);
psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
@@ -236,6 +242,7 @@ int pstore_register(struct pstore_info *psi)
kmsg_dump_register(&pstore_dumper);
pstore_register_console();
+ pstore_register_ftrace();
if (pstore_update_ms >= 0) {
pstore_timer.expires = jiffies +
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 0b311bc1891..1a4f6da58ea 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -32,6 +32,7 @@
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <linux/pstore_ram.h>
#define RAMOOPS_KERNMSG_HDR "===="
@@ -181,12 +182,11 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
return len;
}
-
-static int ramoops_pstore_write_buf(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char *buf, size_t size,
- struct pstore_info *psi)
+static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
+ enum kmsg_dump_reason reason,
+ u64 *id, unsigned int part,
+ const char *buf, size_t size,
+ struct pstore_info *psi)
{
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt];
@@ -406,7 +406,7 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
goto fail_init_fprz;
if (!cxt->przs && !cxt->cprz && !cxt->fprz) {
- pr_err("memory size too small, minimum is %lu\n",
+ pr_err("memory size too small, minimum is %zu\n",
cxt->console_size + cxt->record_size +
cxt->ftrace_size);
goto fail_cnt;
@@ -414,13 +414,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
cxt->pstore.data = cxt;
/*
- * Console can handle any buffer size, so prefer dumps buffer
- * size since usually it is smaller.
+ * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we
+ * have to handle dumps, we must have at least record_size buffer. And
+ * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be
+ * ZERO_SIZE_PTR).
*/
- if (cxt->przs)
- cxt->pstore.bufsize = cxt->przs[0]->buffer_size;
- else
- cxt->pstore.bufsize = cxt->cprz->buffer_size;
+ if (cxt->console_size)
+ cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */
+ cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize);
cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL);
spin_lock_init(&cxt->pstore.buf_lock);
if (!cxt->pstore.buf) {
@@ -537,6 +538,7 @@ postcore_initcall(ramoops_init);
static void __exit ramoops_exit(void)
{
platform_driver_unregister(&ramoops_driver);
+ platform_device_unregister(dummy);
kfree(dummy_data);
}
module_exit(ramoops_exit);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d319963aeb1..c196369fe40 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -896,7 +896,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-const struct xattr_handler *reiserfs_xattr_handlers[] = {
+static const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
diff --git a/fs/super.c b/fs/super.c
index 5fdf7ff32c4..a3bc935069d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -865,7 +865,7 @@ int get_anon_bdev(dev_t *p)
else if (error)
return -EAGAIN;
- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+ if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
spin_lock(&unnamed_dev_lock);
ida_remove(&unnamed_dev_ida, dev);
if (unnamed_dev_start > dev)
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index ff48c5a8530..5bc77817f38 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,6 +1536,7 @@ out_unlock:
static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = ubifs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d1c6093fd3d..77b5953eaac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -118,11 +118,20 @@ static int udf_adinicb_write_end(struct file *file,
return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
}
+static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs)
+{
+ /* Fallback to buffered I/O. */
+ return 0;
+}
+
const struct address_space_operations udf_adinicb_aops = {
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
.write_end = udf_adinicb_write_end,
+ .direct_IO = udf_adinicb_direct_IO,
};
static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 287ef9f587b..df88b957ccf 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -95,11 +95,33 @@ void udf_evict_inode(struct inode *inode)
}
}
+static void udf_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ loff_t isize = inode->i_size;
+
+ if (to > isize) {
+ truncate_pagecache(inode, to, isize);
+ if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+ down_write(&iinfo->i_data_sem);
+ udf_truncate_extents(inode);
+ up_write(&iinfo->i_data_sem);
+ }
+ }
+}
+
static int udf_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, udf_get_block, wbc);
}
+static int udf_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return mpage_writepages(mapping, wbc, udf_get_block);
+}
+
static int udf_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, udf_get_block);
@@ -118,21 +140,24 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
int ret;
ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
- if (unlikely(ret)) {
- struct inode *inode = mapping->host;
- struct udf_inode_info *iinfo = UDF_I(inode);
- loff_t isize = inode->i_size;
-
- if (pos + len > isize) {
- truncate_pagecache(inode, pos + len, isize);
- if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
- down_write(&iinfo->i_data_sem);
- udf_truncate_extents(inode);
- up_write(&iinfo->i_data_sem);
- }
- }
- }
+ if (unlikely(ret))
+ udf_write_failed(mapping, pos + len);
+ return ret;
+}
+static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+
+ ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ udf_get_block);
+ if (unlikely(ret < 0 && (rw & WRITE)))
+ udf_write_failed(mapping, offset + iov_length(iov, nr_segs));
return ret;
}
@@ -145,8 +170,10 @@ const struct address_space_operations udf_aops = {
.readpage = udf_readpage,
.readpages = udf_readpages,
.writepage = udf_writepage,
- .write_begin = udf_write_begin,
- .write_end = generic_write_end,
+ .writepages = udf_writepages,
+ .write_begin = udf_write_begin,
+ .write_end = generic_write_end,
+ .direct_IO = udf_direct_IO,
.bmap = udf_bmap,
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1eaeb8be3aa..aa473fa640a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -940,7 +940,6 @@ xfs_file_mmap(
struct vm_area_struct *vma)
{
vma->vm_ops = &xfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
file_accessed(filp);
return 0;
@@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = xfs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};